diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index a18c454c..1ff47330 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -27,4 +27,4 @@ git commit -m "xxxxxx, test=doc"
1. 虽然跳过了 CI,但是还要先排队排到才能跳过,所以非自己方向看到 pending 不要着急 🤣
2. 在 `git commit --amend` 的时候才加 `test=xxx` 可能不太有效
3. 一个 pr 多次提交 commit 注意每次都要加 `test=xxx`,因为每个 commit 都会触发 CI
-4. 删除 python 环境中已经安装好的的 paddlespeech,否则可能会影响 import paddlespeech 的顺序
+4. 删除 python 环境中已经安装好的 paddlespeech,否则可能会影响 import paddlespeech 的顺序
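   A minimal sketch of that clean-up (command assumed; adjust to your own environment):
   ```bash
   # remove any previously installed paddlespeech so the local source tree is imported first
   pip uninstall -y paddlespeech
   ```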
diff --git a/.github/ISSUE_TEMPLATE/bug-report-tts.md b/.github/ISSUE_TEMPLATE/bug-report-tts.md
index 64b33c32..e2322c23 100644
--- a/.github/ISSUE_TEMPLATE/bug-report-tts.md
+++ b/.github/ISSUE_TEMPLATE/bug-report-tts.md
@@ -3,7 +3,6 @@ name: "\U0001F41B TTS Bug Report"
about: Create a report to help us improve
title: "[TTS]XXXX"
labels: Bug, T2S
-assignees: yt605155624
---
diff --git a/README.md b/README.md
index 3c60db65..abb0a55e 100644
--- a/README.md
+++ b/README.md
@@ -178,6 +178,13 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
- 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV).
### Recent Update
+- 👑 2023.05.31: Add [WavLM ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr5), WavLM fine-tuning for ASR on LibriSpeech.
+- 👑 2023.05.04: Add [HuBERT ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr4), HuBERT fine-tuning for ASR on LibriSpeech.
+- ⚡ 2023.04.28: Fix [0-d tensor](https://github.com/PaddlePaddle/PaddleSpeech/pull/3214): with the upgrade to paddlepaddle==2.5, the issues with modifying 0-d tensors have been resolved.
+- 👑 2023.04.25: Add [AMP for U2 conformer](https://github.com/PaddlePaddle/PaddleSpeech/pull/3167).
+- 🔥 2023.04.06: Add [subtitle file (.srt format) generation example](./demos/streaming_asr_server).
+- 🔥 2023.03.14: Add SVS (Singing Voice Synthesis) examples with the Opencpop dataset, including [DiffSinger](./examples/opencpop/svs1), [PWGAN](./examples/opencpop/voc1) and [HiFiGAN](./examples/opencpop/voc5); the results are being continuously improved.
- 👑 2023.03.09: Add [Wav2vec2ASR-zh](./examples/aishell/asr3).
- 🎉 2023.03.07: Add [TTS ARM Linux C++ Demo](./demos/TTSArmLinux).
- 🔥 2023.03.03 Add Voice Conversion [StarGANv2-VC synthesize pipeline](./examples/vctk/vc3).
@@ -221,13 +228,13 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
## Installation
-We strongly recommend our users to install PaddleSpeech in **Linux** with *python>=3.7* and *paddlepaddle>=2.4.1*.
+We strongly recommend our users install PaddleSpeech on **Linux** with *python>=3.8* and *paddlepaddle<=2.5.1*. Some newer Paddle versions have not yet been adapted in PaddleSpeech, so currently only paddlepaddle 2.5.1 and earlier are supported.
### **Dependency Introduction**
+ gcc >= 4.8.5
-+ paddlepaddle >= 2.4.1
-+ python >= 3.7
++ paddlepaddle <= 2.5.1
++ python >= 3.8
+ OS support: Linux(recommend), Windows, Mac OSX
PaddleSpeech depends on paddlepaddle. For installation, please refer to the official website of [paddlepaddle](https://www.paddlepaddle.org.cn/en) and choose according to your own machine. Here is an example of the cpu version.
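A minimal sketch of such a CPU install, consistent with the version constraints above (exact pins assumed):
```bash
# install a CPU build of paddlepaddle no newer than 2.5.1, then PaddleSpeech itself
pip install paddlepaddle==2.5.1
pip install paddlespeech
```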
@@ -577,14 +584,14 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
- Text Frontend
-
-
- tn / g2p
-
+ Text Frontend
+
+
+ tn / g2p
+
- Acoustic Model
+ Acoustic Model
Tacotron2
LJSpeech / CSMSC
@@ -619,6 +626,13 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
ERNIE-SAT-vctk / ERNIE-SAT-aishell3 / ERNIE-SAT-zh_en
+
+ DiffSinger
+ Opencpop
+
+ DiffSinger-opencpop
+
+
Vocoder
WaveFlow
@@ -629,9 +643,9 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
Parallel WaveGAN
- LJSpeech / VCTK / CSMSC / AISHELL-3
+ LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop
- PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3
+ PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3 / PWGAN-opencpop
@@ -650,9 +664,9 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
HiFiGAN
- LJSpeech / VCTK / CSMSC / AISHELL-3
+ LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop
- HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3
+ HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3 / HiFiGAN-opencpop
@@ -880,15 +894,20 @@ The Text-to-Speech module is originally called [Parakeet](https://github.com/Pad
- **[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk): Use PaddleSpeech TTS and ASR to clone voice from videos.**
-
-
-
-
## Citation
To cite PaddleSpeech for research, please use the following format.
+
```text
+@inproceedings{zhang2022paddlespeech,
+ title = {PaddleSpeech: An Easy-to-Use All-in-One Speech Toolkit},
+ author = {Hui Zhang, Tian Yuan, Junkun Chen, Xintong Li, Renjie Zheng, Yuxin Huang, Xiaojie Chen, Enlei Gong, Zeyu Chen, Xiaoguang Hu, dianhai yu, Yanjun Ma, Liang Huang},
+ booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Demonstrations},
+ year = {2022},
+ publisher = {Association for Computational Linguistics},
+}
+
@InProceedings{pmlr-v162-bai22d,
title = {{A}$^3${T}: Alignment-Aware Acoustic and Text Pretraining for Speech Synthesis and Editing},
author = {Bai, He and Zheng, Renjie and Chen, Junkun and Ma, Mingbo and Li, Xintong and Huang, Liang},
@@ -903,14 +922,6 @@ To cite PaddleSpeech for research, please use the following format.
url = {https://proceedings.mlr.press/v162/bai22d.html},
}
-@inproceedings{zhang2022paddlespeech,
- title = {PaddleSpeech: An Easy-to-Use All-in-One Speech Toolkit},
- author = {Hui Zhang, Tian Yuan, Junkun Chen, Xintong Li, Renjie Zheng, Yuxin Huang, Xiaojie Chen, Enlei Gong, Zeyu Chen, Xiaoguang Hu, dianhai yu, Yanjun Ma, Liang Huang},
- booktitle = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Demonstrations},
- year = {2022},
- publisher = {Association for Computational Linguistics},
-}
-
@inproceedings{zheng2021fused,
title={Fused acoustic and text encoding for multimodal bilingual pretraining and speech translation},
author={Zheng, Renjie and Chen, Junkun and Ma, Mingbo and Huang, Liang},
diff --git a/README_cn.md b/README_cn.md
index 29ee387c..f743c287 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -8,7 +8,7 @@
-
+
@@ -183,6 +183,13 @@
- 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。
### 近期更新
+- 👑 2023.05.31: 新增 [WavLM ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr5), 基于WavLM的英语识别微调,使用LibriSpeech数据集
+- 👑 2023.05.04: 新增 [HuBERT ASR-en](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/librispeech/asr4), 基于HuBERT的英语识别微调,使用LibriSpeech数据集
+- ⚡ 2023.04.28: 修正 [0-d tensor](https://github.com/PaddlePaddle/PaddleSpeech/pull/3214),配合 PaddlePaddle 2.5 升级,修复了 0-d tensor 的问题。
+- 👑 2023.04.25: 新增 [U2 conformer 的 AMP 训练](https://github.com/PaddlePaddle/PaddleSpeech/pull/3167).
+- 👑 2023.04.06: 新增 [srt格式字幕生成功能](./demos/streaming_asr_server)。
+- 🔥 2023.03.14: 新增基于 Opencpop 数据集的 SVS (歌唱合成) 示例,包含 [DiffSinger](./examples/opencpop/svs1)、[PWGAN](./examples/opencpop/voc1) 和 [HiFiGAN](./examples/opencpop/voc5),效果持续优化中。
- 👑 2023.03.09: 新增 [Wav2vec2ASR-zh](./examples/aishell/asr3)。
- 🎉 2023.03.07: 新增 [TTS ARM Linux C++ 部署示例](./demos/TTSArmLinux)。
- 🔥 2023.03.03: 新增声音转换模型 [StarGANv2-VC 合成流程](./examples/vctk/vc3)。
@@ -231,12 +238,12 @@
## 安装
-我们强烈建议用户在 **Linux** 环境下,*3.7* 以上版本的 *python* 上安装 PaddleSpeech。
+我们强烈建议用户在 **Linux** 环境下,*3.8* 以上版本的 *python* 上安装 PaddleSpeech。同时,部分 Paddle 新版本尚未在 PaddleSpeech 中完成适配,因此目前只能使用 2.5.1 及之前的版本。
### 相关依赖
+ gcc >= 4.8.5
-+ paddlepaddle >= 2.4.1
-+ python >= 3.7
++ paddlepaddle <= 2.5.1
++ python >= 3.8
+ linux(推荐), mac, windows
PaddleSpeech 依赖于 paddlepaddle,安装可以参考[ paddlepaddle 官网](https://www.paddlepaddle.org.cn/),根据自己机器的情况进行选择。这里给出 cpu 版本示例,其它版本大家可以根据自己机器的情况进行安装。
@@ -576,43 +583,50 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
tn / g2p
-
-
- 声学模型
+
+
+ 声学模型
Tacotron2
LJSpeech / CSMSC
tacotron2-ljspeech / tacotron2-csmsc
-
-
+
+
Transformer TTS
LJSpeech
transformer-ljspeech
-
-
+
+
SpeedySpeech
CSMSC
speedyspeech-csmsc
-
-
+
+
FastSpeech2
LJSpeech / VCTK / CSMSC / AISHELL-3 / ZH_EN / finetune
fastspeech2-ljspeech / fastspeech2-vctk / fastspeech2-csmsc / fastspeech2-aishell3 / fastspeech2-zh_en / fastspeech2-finetune
-
-
+
+
ERNIE-SAT
VCTK / AISHELL-3 / ZH_EN
ERNIE-SAT-vctk / ERNIE-SAT-aishell3 / ERNIE-SAT-zh_en
-
+
+
+ DiffSinger
+ Opencpop
+
+ DiffSinger-opencpop
+
+
声码器
WaveFlow
@@ -623,9 +637,9 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
Parallel WaveGAN
- LJSpeech / VCTK / CSMSC / AISHELL-3
+ LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop
- PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3
+ PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3 / PWGAN-opencpop
@@ -644,9 +658,9 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
HiFiGAN
- LJSpeech / VCTK / CSMSC / AISHELL-3
+ LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop
- HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3
+ HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3 / HiFiGAN-opencpop
@@ -703,6 +717,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
+
**声音分类**
diff --git a/audio/paddleaudio/backends/soundfile_backend.py b/audio/paddleaudio/backends/soundfile_backend.py
index ae7b5b52..9195ea09 100644
--- a/audio/paddleaudio/backends/soundfile_backend.py
+++ b/audio/paddleaudio/backends/soundfile_backend.py
@@ -191,7 +191,7 @@ def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
if sr <= 0:
raise ParameterError(
- f'Sample rate should be larger than 0, recieved sr = {sr}')
+ f'Sample rate should be larger than 0, received sr = {sr}')
if y.dtype not in ['int16', 'int8']:
warnings.warn(
diff --git a/audio/setup.py b/audio/setup.py
index 823e5dfa..f7d45944 100644
--- a/audio/setup.py
+++ b/audio/setup.py
@@ -34,12 +34,14 @@ from tools import setup_helpers
ROOT_DIR = Path(__file__).parent.resolve()
-VERSION = '1.1.0'
+VERSION = '1.2.0'
COMMITID = 'none'
base = [
- "kaldiio",
+ # paddleaudio align with librosa==0.8.1, which need numpy==1.23.x
"librosa==0.8.1",
+ "numpy==1.23.5",
+ "kaldiio",
"pathos",
"pybind11",
"parameterized",
diff --git a/audio/tests/features/base.py b/audio/tests/features/base.py
index d183b72a..3bb1d1dd 100644
--- a/audio/tests/features/base.py
+++ b/audio/tests/features/base.py
@@ -37,7 +37,7 @@ class FeatTest(unittest.TestCase):
self.waveform, self.sr = load(os.path.abspath(os.path.basename(url)))
self.waveform = self.waveform.astype(
np.float32
- ) # paddlespeech.s2t.transform.spectrogram only supports float32
+ ) # paddlespeech.audio.transform.spectrogram only supports float32
dim = len(self.waveform.shape)
assert dim in [1, 2]
diff --git a/audio/tests/features/test_istft.py b/audio/tests/features/test_istft.py
index 9cf8cdd6..ea1ee5cb 100644
--- a/audio/tests/features/test_istft.py
+++ b/audio/tests/features/test_istft.py
@@ -18,8 +18,8 @@ import paddle
from paddleaudio.functional.window import get_window
from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import IStft
-from paddlespeech.s2t.transform.spectrogram import Stft
+from paddlespeech.audio.transform.spectrogram import IStft
+from paddlespeech.audio.transform.spectrogram import Stft
class TestIstft(FeatTest):
diff --git a/audio/tests/features/test_log_melspectrogram.py b/audio/tests/features/test_log_melspectrogram.py
index 7d568038..b2765d3b 100644
--- a/audio/tests/features/test_log_melspectrogram.py
+++ b/audio/tests/features/test_log_melspectrogram.py
@@ -18,7 +18,7 @@ import paddle
import paddleaudio
from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogram
+from paddlespeech.audio.transform.spectrogram import LogMelSpectrogram
class TestLogMelSpectrogram(FeatTest):
diff --git a/audio/tests/features/test_spectrogram.py b/audio/tests/features/test_spectrogram.py
index 5fe5afee..6f460963 100644
--- a/audio/tests/features/test_spectrogram.py
+++ b/audio/tests/features/test_spectrogram.py
@@ -18,7 +18,7 @@ import paddle
import paddleaudio
from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import Spectrogram
+from paddlespeech.audio.transform.spectrogram import Spectrogram
class TestSpectrogram(FeatTest):
diff --git a/audio/tests/features/test_stft.py b/audio/tests/features/test_stft.py
index 58792ffe..9511a292 100644
--- a/audio/tests/features/test_stft.py
+++ b/audio/tests/features/test_stft.py
@@ -18,7 +18,7 @@ import paddle
from paddleaudio.functional.window import get_window
from .base import FeatTest
-from paddlespeech.s2t.transform.spectrogram import Stft
+from paddlespeech.audio.transform.spectrogram import Stft
class TestStft(FeatTest):
diff --git a/dataset/aidatatang_200zh/aidatatang_200zh.py b/dataset/aidatatang_200zh/aidatatang_200zh.py
index 85f478c2..3b706c49 100644
--- a/dataset/aidatatang_200zh/aidatatang_200zh.py
+++ b/dataset/aidatatang_200zh/aidatatang_200zh.py
@@ -18,139 +18,7 @@ Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
-import argparse
-import codecs
-import json
-import os
-from pathlib import Path
-
-import soundfile
-
-from utils.utility import download
-from utils.utility import unpack
-
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
-
-URL_ROOT = 'http://www.openslr.org/resources/62'
-# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
-DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
-MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
- "--target_dir",
- default=DATA_HOME + "/aidatatang_200zh",
- type=str,
- help="Directory to save the dataset. (default: %(default)s)")
-parser.add_argument(
- "--manifest_prefix",
- default="manifest",
- type=str,
- help="Filepath prefix for output manifests. (default: %(default)s)")
-args = parser.parse_args()
-
-
-def create_manifest(data_dir, manifest_path_prefix):
- print("Creating manifest %s ..." % manifest_path_prefix)
- json_lines = []
- transcript_path = os.path.join(data_dir, 'transcript',
- 'aidatatang_200_zh_transcript.txt')
- transcript_dict = {}
- for line in codecs.open(transcript_path, 'r', 'utf-8'):
- line = line.strip()
- if line == '':
- continue
- audio_id, text = line.split(' ', 1)
- # remove withespace, charactor text
- text = ''.join(text.split())
- transcript_dict[audio_id] = text
-
- data_types = ['train', 'dev', 'test']
- for dtype in data_types:
- del json_lines[:]
- total_sec = 0.0
- total_text = 0.0
- total_num = 0
-
- audio_dir = os.path.join(data_dir, 'corpus/', dtype)
- for subfolder, _, filelist in sorted(os.walk(audio_dir)):
- for fname in filelist:
- if not fname.endswith('.wav'):
- continue
-
- audio_path = os.path.abspath(os.path.join(subfolder, fname))
- audio_id = os.path.basename(fname)[:-4]
- utt2spk = Path(audio_path).parent.name
-
- audio_data, samplerate = soundfile.read(audio_path)
- duration = float(len(audio_data) / samplerate)
- text = transcript_dict[audio_id]
- json_lines.append(
- json.dumps(
- {
- 'utt': audio_id,
- 'utt2spk': str(utt2spk),
- 'feat': audio_path,
- 'feat_shape': (duration, ), # second
- 'text': text,
- },
- ensure_ascii=False))
-
- total_sec += duration
- total_text += len(text)
- total_num += 1
-
- manifest_path = manifest_path_prefix + '.' + dtype
- with codecs.open(manifest_path, 'w', 'utf-8') as fout:
- for line in json_lines:
- fout.write(line + '\n')
-
- manifest_dir = os.path.dirname(manifest_path_prefix)
- meta_path = os.path.join(manifest_dir, dtype) + '.meta'
- with open(meta_path, 'w') as f:
- print(f"{dtype}:", file=f)
- print(f"{total_num} utts", file=f)
- print(f"{total_sec / (60*60)} h", file=f)
- print(f"{total_text} text", file=f)
- print(f"{total_text / total_sec} text/sec", file=f)
- print(f"{total_sec / total_num} sec/utt", file=f)
-
-
-def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
- """Download, unpack and create manifest file."""
- data_dir = os.path.join(target_dir, subset)
- if not os.path.exists(data_dir):
- filepath = download(url, md5sum, target_dir)
- unpack(filepath, target_dir)
- # unpack all audio tar files
- audio_dir = os.path.join(data_dir, 'corpus')
- for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)):
- for sub in dirlist:
- print(f"unpack dir {sub}...")
- for folder, _, filelist in sorted(
- os.walk(os.path.join(subfolder, sub))):
- for ftar in filelist:
- unpack(os.path.join(folder, ftar), folder, True)
- else:
- print("Skip downloading and unpacking. Data already exists in %s." %
- target_dir)
-
- create_manifest(data_dir, manifest_path)
-
-
-def main():
- if args.target_dir.startswith('~'):
- args.target_dir = os.path.expanduser(args.target_dir)
-
- prepare_dataset(
- url=DATA_URL,
- md5sum=MD5_DATA,
- target_dir=args.target_dir,
- manifest_path=args.manifest_prefix,
- subset='aidatatang_200zh')
-
- print("Data download and manifest prepare done!")
-
+from paddlespeech.dataset.aidatatang_200zh import aidatatang_200zh_main
if __name__ == '__main__':
- main()
+ aidatatang_200zh_main()
diff --git a/dataset/aishell/README.md b/dataset/aishell/README.md
deleted file mode 100644
index a7dd0cf3..00000000
--- a/dataset/aishell/README.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# [Aishell1](http://openslr.elda.org/33/)
-
-This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is a part of AISHELL-ASR0009, of which utterance contains 11 domains, including smart home, autonomous driving, and industrial production. The whole recording was put in quiet indoor environment, using 3 different devices at the same time: high fidelity microphone (44.1kHz, 16-bit,); Android-system mobile phone (16kHz, 16-bit), iOS-system mobile phone (16kHz, 16-bit). Audios in high fidelity were re-sampled to 16kHz to build AISHELL- ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. The manual transcription accuracy rate is above 95%, through professional speech annotation and strict quality inspection. The corpus is divided into training, development and testing sets. ( This database is free for academic research, not in the commerce, if without permission. )
diff --git a/dataset/aishell/aishell.py b/dataset/aishell/aishell.py
index ec43104d..b3288757 100644
--- a/dataset/aishell/aishell.py
+++ b/dataset/aishell/aishell.py
@@ -18,143 +18,7 @@ Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
-import argparse
-import codecs
-import json
-import os
-from pathlib import Path
-
-import soundfile
-
-from utils.utility import download
-from utils.utility import unpack
-
-DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
-
-URL_ROOT = 'http://openslr.elda.org/resources/33'
-# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
-DATA_URL = URL_ROOT + '/data_aishell.tgz'
-MD5_DATA = '2f494334227864a8a8fec932999db9d8'
-RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
-MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
- "--target_dir",
- default=DATA_HOME + "/Aishell",
- type=str,
- help="Directory to save the dataset. (default: %(default)s)")
-parser.add_argument(
- "--manifest_prefix",
- default="manifest",
- type=str,
- help="Filepath prefix for output manifests. (default: %(default)s)")
-args = parser.parse_args()
-
-
-def create_manifest(data_dir, manifest_path_prefix):
- print("Creating manifest %s ..." % manifest_path_prefix)
- json_lines = []
- transcript_path = os.path.join(data_dir, 'transcript',
- 'aishell_transcript_v0.8.txt')
- transcript_dict = {}
- for line in codecs.open(transcript_path, 'r', 'utf-8'):
- line = line.strip()
- if line == '':
- continue
- audio_id, text = line.split(' ', 1)
- # remove withespace, charactor text
- text = ''.join(text.split())
- transcript_dict[audio_id] = text
-
- data_types = ['train', 'dev', 'test']
- for dtype in data_types:
- del json_lines[:]
- total_sec = 0.0
- total_text = 0.0
- total_num = 0
-
- audio_dir = os.path.join(data_dir, 'wav', dtype)
- for subfolder, _, filelist in sorted(os.walk(audio_dir)):
- for fname in filelist:
- audio_path = os.path.abspath(os.path.join(subfolder, fname))
- audio_id = os.path.basename(fname)[:-4]
- # if no transcription for audio then skipped
- if audio_id not in transcript_dict:
- continue
-
- utt2spk = Path(audio_path).parent.name
- audio_data, samplerate = soundfile.read(audio_path)
- duration = float(len(audio_data) / samplerate)
- text = transcript_dict[audio_id]
- json_lines.append(
- json.dumps(
- {
- 'utt': audio_id,
- 'utt2spk': str(utt2spk),
- 'feat': audio_path,
- 'feat_shape': (duration, ), # second
- 'text': text
- },
- ensure_ascii=False))
-
- total_sec += duration
- total_text += len(text)
- total_num += 1
-
- manifest_path = manifest_path_prefix + '.' + dtype
- with codecs.open(manifest_path, 'w', 'utf-8') as fout:
- for line in json_lines:
- fout.write(line + '\n')
-
- manifest_dir = os.path.dirname(manifest_path_prefix)
- meta_path = os.path.join(manifest_dir, dtype) + '.meta'
- with open(meta_path, 'w') as f:
- print(f"{dtype}:", file=f)
- print(f"{total_num} utts", file=f)
- print(f"{total_sec / (60*60)} h", file=f)
- print(f"{total_text} text", file=f)
- print(f"{total_text / total_sec} text/sec", file=f)
- print(f"{total_sec / total_num} sec/utt", file=f)
-
-
-def prepare_dataset(url, md5sum, target_dir, manifest_path=None):
- """Download, unpack and create manifest file."""
- data_dir = os.path.join(target_dir, 'data_aishell')
- if not os.path.exists(data_dir):
- filepath = download(url, md5sum, target_dir)
- unpack(filepath, target_dir)
- # unpack all audio tar files
- audio_dir = os.path.join(data_dir, 'wav')
- for subfolder, _, filelist in sorted(os.walk(audio_dir)):
- for ftar in filelist:
- unpack(os.path.join(subfolder, ftar), subfolder, True)
- else:
- print("Skip downloading and unpacking. Data already exists in %s." %
- target_dir)
-
- if manifest_path:
- create_manifest(data_dir, manifest_path)
-
-
-def main():
- if args.target_dir.startswith('~'):
- args.target_dir = os.path.expanduser(args.target_dir)
-
- prepare_dataset(
- url=DATA_URL,
- md5sum=MD5_DATA,
- target_dir=args.target_dir,
- manifest_path=args.manifest_prefix)
-
- prepare_dataset(
- url=RESOURCE_URL,
- md5sum=MD5_RESOURCE,
- target_dir=args.target_dir,
- manifest_path=None)
-
- print("Data download and manifest prepare done!")
-
+from paddlespeech.dataset.aishell import aishell_main
if __name__ == '__main__':
- main()
+ aishell_main()
diff --git a/dataset/librispeech/librispeech.py b/dataset/librispeech/librispeech.py
index 2d6f1763..44567b0c 100644
--- a/dataset/librispeech/librispeech.py
+++ b/dataset/librispeech/librispeech.py
@@ -28,8 +28,8 @@ from multiprocessing.pool import Pool
import distutils.util
import soundfile
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
URL_ROOT = "http://openslr.elda.org/resources/12"
#URL_ROOT = "https://openslr.magicdatatech.com/resources/12"
diff --git a/dataset/mini_librispeech/mini_librispeech.py b/dataset/mini_librispeech/mini_librispeech.py
index 0eb80bf8..24bd98d8 100644
--- a/dataset/mini_librispeech/mini_librispeech.py
+++ b/dataset/mini_librispeech/mini_librispeech.py
@@ -27,8 +27,8 @@ from multiprocessing.pool import Pool
import soundfile
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
URL_ROOT = "http://openslr.elda.org/resources/31"
URL_TRAIN_CLEAN = URL_ROOT + "/train-clean-5.tar.gz"
diff --git a/dataset/musan/musan.py b/dataset/musan/musan.py
index ae3430b2..85d986e8 100644
--- a/dataset/musan/musan.py
+++ b/dataset/musan/musan.py
@@ -29,8 +29,8 @@ import os
import soundfile
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
diff --git a/dataset/rir_noise/rir_noise.py b/dataset/rir_noise/rir_noise.py
index b1d47558..b98dff72 100644
--- a/dataset/rir_noise/rir_noise.py
+++ b/dataset/rir_noise/rir_noise.py
@@ -29,8 +29,8 @@ import os
import soundfile
-from utils.utility import download
-from utils.utility import unzip
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unzip
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
diff --git a/dataset/thchs30/thchs30.py b/dataset/thchs30/thchs30.py
index d41c0e17..c5c3eb7a 100644
--- a/dataset/thchs30/thchs30.py
+++ b/dataset/thchs30/thchs30.py
@@ -27,8 +27,8 @@ from pathlib import Path
import soundfile
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
diff --git a/dataset/timit/timit.py b/dataset/timit/timit.py
index c4a9f066..f3889d17 100644
--- a/dataset/timit/timit.py
+++ b/dataset/timit/timit.py
@@ -28,7 +28,7 @@ from pathlib import Path
import soundfile
-from utils.utility import unzip
+from paddlespeech.dataset.download import unzip
URL_ROOT = ""
MD5_DATA = "45c68037c7fdfe063a43c851f181fb2d"
diff --git a/dataset/voxceleb/voxceleb1.py b/dataset/voxceleb/voxceleb1.py
index 95827f70..8d410067 100644
--- a/dataset/voxceleb/voxceleb1.py
+++ b/dataset/voxceleb/voxceleb1.py
@@ -31,9 +31,9 @@ from pathlib import Path
import soundfile
-from utils.utility import check_md5sum
-from utils.utility import download
-from utils.utility import unzip
+from paddlespeech.dataset.download import check_md5sum
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unzip
# all the data will be download in the current data/voxceleb directory default
DATA_HOME = os.path.expanduser('.')
diff --git a/dataset/voxceleb/voxceleb2.py b/dataset/voxceleb/voxceleb2.py
index fe9e8b9c..6df6d1f3 100644
--- a/dataset/voxceleb/voxceleb2.py
+++ b/dataset/voxceleb/voxceleb2.py
@@ -27,9 +27,9 @@ from pathlib import Path
import soundfile
-from utils.utility import check_md5sum
-from utils.utility import download
-from utils.utility import unzip
+from paddlespeech.dataset.download import check_md5sum
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unzip
# all the data will be download in the current data/voxceleb directory default
DATA_HOME = os.path.expanduser('.')
diff --git a/dataset/voxforge/voxforge.py b/dataset/voxforge/voxforge.py
index 373791bf..327d200b 100644
--- a/dataset/voxforge/voxforge.py
+++ b/dataset/voxforge/voxforge.py
@@ -28,9 +28,9 @@ import subprocess
import soundfile
-from utils.utility import download_multi
-from utils.utility import getfile_insensitive
-from utils.utility import unpack
+from paddlespeech.dataset.download import download_multi
+from paddlespeech.dataset.download import getfile_insensitive
+from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
diff --git a/demos/README.md b/demos/README.md
index a4196786..6f9cd2e4 100644
--- a/demos/README.md
+++ b/demos/README.md
@@ -18,4 +18,4 @@ This directory contains many speech applications in multiple scenarios.
* style_fs2 - multi style control for FastSpeech2 model
* text_to_speech - convert text into speech
* self supervised pretraining - speech feature extraction and speech recognition based on wav2vec2
-* Wishper - speech recognize and translate based on Whisper model
+* Whisper - speech recognize and translate based on Whisper model
diff --git a/demos/TTSAndroid/README.md b/demos/TTSAndroid/README.md
index 36ff969f..36848cbe 100644
--- a/demos/TTSAndroid/README.md
+++ b/demos/TTSAndroid/README.md
@@ -1,6 +1,6 @@
# 语音合成 Java API Demo 使用指南
-在 Android 上实现语音合成功能,此 Demo 有很好的的易用性和开放性,如在 Demo 中跑自己训练好的模型等。
+在 Android 上实现语音合成功能,此 Demo 有很好的易用性和开放性,如在 Demo 中跑自己训练好的模型等。
本文主要介绍语音合成 Demo 运行方法。
diff --git a/demos/audio_searching/src/test_audio_search.py b/demos/audio_searching/src/test_audio_search.py
index cb91e156..f9ea2929 100644
--- a/demos/audio_searching/src/test_audio_search.py
+++ b/demos/audio_searching/src/test_audio_search.py
@@ -14,8 +14,8 @@
from audio_search import app
from fastapi.testclient import TestClient
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
client = TestClient(app)
diff --git a/demos/audio_searching/src/test_vpr_search.py b/demos/audio_searching/src/test_vpr_search.py
index 298e12eb..cc795564 100644
--- a/demos/audio_searching/src/test_vpr_search.py
+++ b/demos/audio_searching/src/test_vpr_search.py
@@ -14,8 +14,8 @@
from fastapi.testclient import TestClient
from vpr_search import app
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
client = TestClient(app)
diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md
index 7e7d4b2c..116f1fd7 100644
--- a/demos/speech_server/README.md
+++ b/demos/speech_server/README.md
@@ -34,6 +34,8 @@ Currently the engine type supports two forms: python and inference (Paddle Infer
paddlespeech_server start --config_file ./conf/application.yaml
```
+ > **Note:** For mixed Chinese and English speech recognition, please use the `./conf/conformer_talcs_application.yaml` configuration file
+
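A minimal sketch of starting the server with that config (same CLI as above, only the config file swapped):
```bash
paddlespeech_server start --config_file ./conf/conformer_talcs_application.yaml
```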
Usage:
```bash
@@ -85,6 +87,7 @@ Here are sample files for this ASR client demo that can be downloaded:
```bash
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
```
**Note:** The response time will be slightly longer when using the client for the first time
@@ -92,8 +95,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
If `127.0.0.1` is not accessible, you need to use the actual service IP address.
- ```
+ ```bash
paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
+
+ # Chinese and English mixed speech recognition, using `./conf/conformer_talcs_application.yaml` config file
+ paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./ch_zh_mix.wav
```
Usage:
diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md
index 59492828..f2cb349e 100644
--- a/demos/speech_server/README_cn.md
+++ b/demos/speech_server/README_cn.md
@@ -37,6 +37,8 @@
paddlespeech_server start --config_file ./conf/application.yaml
```
+ > **注意:** 中英文混合语音识别请使用 `./conf/conformer_talcs_application.yaml` 配置文件
+
使用方法:
```bash
@@ -79,6 +81,8 @@
[2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit)
```
+
+
### 4. ASR 客户端使用方法
ASR 客户端的输入是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。
@@ -87,6 +91,7 @@ ASR 客户端的输入是一个 WAV 文件(`.wav`),并且采样率必须
```bash
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
```
**注意:** 初次使用客户端时响应时间会略长
@@ -94,8 +99,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
若 `127.0.0.1` 不能访问,则需要使用实际服务 IP 地址
- ```
+ ```bash
paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
+
+ # 中英文混合语音识别 , 请使用 `./conf/conformer_talcs_application.yaml` 配置文件
+ paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./ch_zh_mix.wav
```
使用帮助:
diff --git a/demos/speech_server/conf/conformer_talcs_application.yaml b/demos/speech_server/conf/conformer_talcs_application.yaml
new file mode 100644
index 00000000..f5f9897b
--- /dev/null
+++ b/demos/speech_server/conf/conformer_talcs_application.yaml
@@ -0,0 +1,163 @@
+# This is the parameter configuration file for PaddleSpeech Offline Serving.
+
+#################################################################################
+# SERVER SETTING #
+#################################################################################
+host: 0.0.0.0
+port: 8090
+
+# The task format in the engine_list is: <speech task>_<engine type>
+# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference', 'text_python', 'vector_python']
+protocol: 'http'
+engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python']
+
+
+#################################################################################
+# ENGINE CONFIG #
+#################################################################################
+
+################################### ASR #########################################
+################### speech task: asr; engine_type: python #######################
+asr_python:
+ model: 'conformer_talcs'
+ lang: 'zh_en'
+ sample_rate: 16000
+ cfg_path: # [optional]
+ ckpt_path: # [optional]
+ decode_method: 'attention_rescoring'
+ force_yes: True
+ codeswitch: True
+ device: # set 'gpu:id' or 'cpu'
+
+################### speech task: asr; engine_type: inference #######################
+asr_inference:
+ # model_type choices=['deepspeech2offline_aishell']
+ model_type: 'deepspeech2offline_aishell'
+ am_model: # the pdmodel file of am static model [optional]
+ am_params: # the pdiparams file of am static model [optional]
+ lang: 'zh'
+ sample_rate: 16000
+ cfg_path:
+ decode_method:
+ force_yes: True
+
+ am_predictor_conf:
+ device: # set 'gpu:id' or 'cpu'
+ switch_ir_optim: True
+ glog_info: False # True -> print glog
+ summary: True # False -> do not show predictor config
+
+
+################################### TTS #########################################
+################### speech task: tts; engine_type: python #######################
+tts_python:
+ # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
+ # 'fastspeech2_ljspeech', 'fastspeech2_aishell3',
+ # 'fastspeech2_vctk', 'fastspeech2_mix',
+ # 'tacotron2_csmsc', 'tacotron2_ljspeech']
+ am: 'fastspeech2_csmsc'
+ am_config:
+ am_ckpt:
+ am_stat:
+ phones_dict:
+ tones_dict:
+ speaker_dict:
+
+
+ # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
+ # 'pwgan_vctk', 'mb_melgan_csmsc', 'style_melgan_csmsc',
+ # 'hifigan_csmsc', 'hifigan_ljspeech', 'hifigan_aishell3',
+ # 'hifigan_vctk', 'wavernn_csmsc']
+ voc: 'mb_melgan_csmsc'
+ voc_config:
+ voc_ckpt:
+ voc_stat:
+
+ # others
+ lang: 'zh'
+ device: # set 'gpu:id' or 'cpu'
+
+
+################### speech task: tts; engine_type: inference #######################
+tts_inference:
+ # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']
+ am: 'fastspeech2_csmsc'
+ am_model: # the pdmodel file of your am static model (XX.pdmodel)
+ am_params: # the pdiparams file of your am static model (XX.pdiparams)
+ am_sample_rate: 24000
+ phones_dict:
+ tones_dict:
+ speaker_dict:
+
+
+ am_predictor_conf:
+ device: # set 'gpu:id' or 'cpu'
+ switch_ir_optim: True
+ glog_info: False # True -> print glog
+ summary: True # False -> do not show predictor config
+
+ # voc (vocoder) choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc']
+ voc: 'mb_melgan_csmsc'
+ voc_model: # the pdmodel file of your vocoder static model (XX.pdmodel)
+ voc_params: # the pdiparams file of your vocoder static model (XX.pdiparams)
+ voc_sample_rate: 24000
+
+ voc_predictor_conf:
+ device: # set 'gpu:id' or 'cpu'
+ switch_ir_optim: True
+ glog_info: False # True -> print glog
+ summary: True # False -> do not show predictor config
+
+ # others
+ lang: 'zh'
+
+
+################################### CLS #########################################
+################### speech task: cls; engine_type: python #######################
+cls_python:
+ # model choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6']
+ model: 'panns_cnn14'
+ cfg_path: # [optional] Config of cls task.
+ ckpt_path: # [optional] Checkpoint file of model.
+ label_file: # [optional] Label file of cls task.
+ device: # set 'gpu:id' or 'cpu'
+
+
+################### speech task: cls; engine_type: inference #######################
+cls_inference:
+ # model_type choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6']
+ model_type: 'panns_cnn14'
+ cfg_path:
+ model_path: # the pdmodel file of am static model [optional]
+ params_path: # the pdiparams file of am static model [optional]
+ label_file: # [optional] Label file of cls task.
+
+ predictor_conf:
+ device: # set 'gpu:id' or 'cpu'
+ switch_ir_optim: True
+ glog_info: False # True -> print glog
+ summary: True # False -> do not show predictor config
+
+
+################################### Text #########################################
+################### text task: punc; engine_type: python #######################
+text_python:
+ task: punc
+ model_type: 'ernie_linear_p3_wudao'
+ lang: 'zh'
+ sample_rate: 16000
+ cfg_path: # [optional]
+ ckpt_path: # [optional]
+ vocab_file: # [optional]
+ device: # set 'gpu:id' or 'cpu'
+
+
+################################### Vector ######################################
+################### Vector task: spk; engine_type: python #######################
+vector_python:
+ task: spk
+ model_type: 'ecapatdnn_voxceleb12'
+ sample_rate: 16000
+ cfg_path: # [optional]
+ ckpt_path: # [optional]
+ device: # set 'gpu:id' or 'cpu'
diff --git a/demos/speech_ssl/README.md b/demos/speech_ssl/README.md
index b98a7cc6..ef9b2237 100644
--- a/demos/speech_ssl/README.md
+++ b/demos/speech_ssl/README.md
@@ -36,7 +36,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
```
Arguments:
- `input`(required): Audio file to recognize.
- - `model`: Model type of asr task. Default: `wav2vec2ASR_librispeech`.
+ - `model`: Model type of asr task. Default: `wav2vec2`, choices: [wav2vec2, hubert, wavlm].
- `task`: Output type. Default: `asr`.
- `lang`: Model language. Default: `en`.
- `sample_rate`: Sample rate of the model. Default: `16000`.
@@ -56,7 +56,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
# to recognize text
text = ssl_executor(
- model='wav2vec2ASR_librispeech',
+ model='wav2vec2',
task='asr',
lang='en',
sample_rate=16000,
diff --git a/demos/speech_ssl/README_cn.md b/demos/speech_ssl/README_cn.md
index 65961ce9..a18c778a 100644
--- a/demos/speech_ssl/README_cn.md
+++ b/demos/speech_ssl/README_cn.md
@@ -36,7 +36,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
```
参数:
- `input`(必须输入):用于识别的音频文件。
- - `model`:ASR 任务的模型,默认值:`wav2vec2ASR_librispeech`。
+ - `model`:ASR 任务的模型,默认值:`wav2vec2`, 可选项:[wav2vec2, hubert, wavlm]。
- `task`:输出类别,默认值:`asr`。
- `lang`:模型语言,默认值:`en`。
- `sample_rate`:音频采样率,默认值:`16000`。
@@ -56,7 +56,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
# 识别文本
text = ssl_executor(
- model='wav2vec2ASR_librispeech',
+ model='wav2vec2',
task='asr',
lang='en',
sample_rate=16000,
diff --git a/demos/speech_web/README.md b/demos/speech_web/README.md
index 572781ab..fc1fe710 100644
--- a/demos/speech_web/README.md
+++ b/demos/speech_web/README.md
@@ -23,7 +23,7 @@ Paddle Speech Demo 是一个以 PaddleSpeech 的语音交互功能为主体开
+ ERNIE-SAT:语言-语音跨模态大模型 ERNIE-SAT 可视化展示示例,支持个性化合成,跨语言语音合成(音频为中文则输入英文文本进行合成),语音编辑(修改音频文字中间的结果)功能。 ERNIE-SAT 更多实现细节,可以参考:
+ [【ERNIE-SAT with AISHELL-3 dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/ernie_sat)
- + [【ERNIE-SAT with with AISHELL3 and VCTK datasets】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat)
+ + [【ERNIE-SAT with AISHELL3 and VCTK datasets】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat)
+ [【ERNIE-SAT with VCTK dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/ernie_sat)
运行效果:
diff --git a/demos/speech_web/speech_server/main.py b/demos/speech_web/speech_server/main.py
index 03e7e599..f4678628 100644
--- a/demos/speech_web/speech_server/main.py
+++ b/demos/speech_web/speech_server/main.py
@@ -260,7 +260,7 @@ async def websocket_endpoint_online(websocket: WebSocket):
# and we break the loop
if message['signal'] == 'start':
resp = {"status": "ok", "signal": "server_ready"}
- # do something at begining here
+ # do something at beginning here
# create the instance to process the audio
# connection_handler = chatbot.asr.connection_handler
connection_handler = PaddleASRConnectionHanddler(engine)
diff --git a/demos/speech_web/speech_server/src/ge2e_clone.py b/demos/speech_web/speech_server/src/ge2e_clone.py
index 83c2b3f3..0711a40a 100644
--- a/demos/speech_web/speech_server/src/ge2e_clone.py
+++ b/demos/speech_web/speech_server/src/ge2e_clone.py
@@ -38,23 +38,9 @@ class VoiceCloneGE2E():
output_dir = os.path.dirname(out_wav)
ngpu = get_ngpu()
- cmd = f"""
- python3 {self.BIN_DIR}/voice_cloning.py \
- --am={self.am} \
- --am_config={self.am_config} \
- --am_ckpt={self.am_ckpt} \
- --am_stat={self.am_stat} \
- --voc={self.voc} \
- --voc_config={self.voc_config} \
- --voc_ckpt={self.voc_ckpt} \
- --voc_stat={self.voc_stat} \
- --ge2e_params_path={self.ge2e_params_path} \
- --text="{text}" \
- --input-dir={ref_audio_dir} \
- --output-dir={output_dir} \
- --phones-dict={self.phones_dict} \
- --ngpu={ngpu}
- """
+ cmd = f"""python {self.BIN_DIR}/voice_cloning.py --am={self.am} --am_config={self.am_config} --am_ckpt={self.am_ckpt} --am_stat={self.am_stat} --voc={self.voc} --voc_config={self.voc_config} --voc_ckpt={self.voc_ckpt} --voc_stat={self.voc_stat} --ge2e_params_path={self.ge2e_params_path} --text="{text}" --input-dir={ref_audio_dir} --output-dir={output_dir} --phones-dict={self.phones_dict} --ngpu={ngpu}"""
+
+ print(cmd)
output_name = os.path.join(output_dir, full_file_name)
return run_cmd(cmd, output_name=output_name)
diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md
index 1d33b694..31256d15 100644
--- a/demos/streaming_asr_server/README.md
+++ b/demos/streaming_asr_server/README.md
@@ -9,7 +9,7 @@ This demo is an implementation of starting the streaming speech service and acce
Streaming ASR server only supports the `websocket` protocol; it doesn't support the `http` protocol.
-服务接口定义请参考:
+For service interface definitions, please refer to:
- [PaddleSpeech Streaming Server WebSocket API](https://github.com/PaddlePaddle/PaddleSpeech/wiki/PaddleSpeech-Server-WebSocket-API)
## Usage
You can choose one way from easy, medium and hard to install paddlespeech.
**If you install in easy mode, you need to prepare the yaml file by yourself, you can refer to
### 2. Prepare config File
-The configuration file can be found in `conf/ws_application.yaml` 和 `conf/ws_conformer_wenetspeech_application.yaml`.
+The configuration file can be found in `conf/ws_application.yaml` or `conf/ws_conformer_wenetspeech_application.yaml`.
At present, the speech tasks integrated by the model include: DeepSpeech2 and conformer.
@@ -87,7 +87,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
server_executor = ServerExecutor()
server_executor(
- config_file="./conf/ws_conformer_wenetspeech_application.yaml",
+ config_file="./conf/ws_conformer_wenetspeech_application_faster.yaml",
log_file="./log/paddlespeech.log")
```
@@ -579,3 +579,354 @@ bash server.sh
[2022-05-07 11:11:18,915] [ INFO] - audio duration: 4.9968125, elapsed time: 15.928460597991943, RTF=3.187724293835709
[2022-05-07 11:11:18,916] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康
```
+
+## Generate the corresponding subtitle (.srt format) from an audio file (.wav or .mp3 format)
+
+By default, each service is deployed on the CPU. Speech recognition and punctuation prediction can be deployed on different GPUs by modifying the `device` parameter in their respective service configuration files.
+
+We use `streaming_asr_server.py` and `punc_server.py` to launch the streaming speech recognition and punctuation prediction services respectively. The `websocket_client_srt.py` script then calls both services at the same time and generates the corresponding subtitle file (.srt format).
+
+**You need to install ffmpeg before running this script.**
+
+**You should be in the `.../demos/streaming_asr_server/` directory.**
+
+### 1. Start the two servers
+
+```bash
+# Note: streaming speech recognition and punctuation prediction are configured on different graphics cards through their configuration files
+paddlespeech_server start --config_file ./conf/ws_conformer_wenetspeech_application.yaml
+```
+
+Open another terminal and run the following command:
+```bash
+paddlespeech_server start --config_file conf/punc_application.yaml
+```
+
+### 2. Call the client
+
+ ```bash
+ python3 local/websocket_client_srt.py --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ../../data/认知.mp3
+ ```
+ Output:
+ ```text
+ [2023-03-30 23:26:13,991] [ INFO] - Start to do streaming asr client
+[2023-03-30 23:26:13,994] [ INFO] - asr websocket client start
+[2023-03-30 23:26:13,994] [ INFO] - endpoint: http://127.0.0.1:8190/paddlespeech/text
+[2023-03-30 23:26:13,994] [ INFO] - endpoint: ws://127.0.0.1:8090/paddlespeech/asr/streaming
+[2023-03-30 23:26:14,475] [ INFO] - /home/fxb/PaddleSpeech-develop/data/认知.mp3 converted to /home/fxb/PaddleSpeech-develop/data/认知.wav
+[2023-03-30 23:26:14,476] [ INFO] - start to process the wavscp: /home/fxb/PaddleSpeech-develop/data/认知.wav
+[2023-03-30 23:26:14,515] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"}
+[2023-03-30 23:26:14,533] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,545] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,556] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,572] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,588] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,600] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,613] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,626] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:15,122] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,135] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,154] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,163] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,175] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,185] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,196] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,637] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,648] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,657] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,666] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,676] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,683] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,691] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,703] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:16,146] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,159] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,167] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,177] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,187] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,197] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,210] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,694] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,704] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,713] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,725] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,737] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,749] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,759] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,770] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:17,279] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'}
+[2023-03-30 23:26:17,302] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'}
+[2023-03-30 23:26:17,316] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'}
+[2023-03-30 23:26:17,332] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'}
+[2023-03-30 23:26:17,343] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'}
+[2023-03-30 23:26:17,358] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'}
+[2023-03-30 23:26:17,373] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'}
+[2023-03-30 23:26:17,958] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:17,971] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:17,987] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:18,000] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:18,017] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:18,028] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:18,038] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:18,049] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:18,653] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'}
+[2023-03-30 23:26:18,689] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'}
+[2023-03-30 23:26:18,701] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'}
+[2023-03-30 23:26:18,712] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'}
+[2023-03-30 23:26:18,723] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'}
+[2023-03-30 23:26:18,750] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'}
+[2023-03-30 23:26:18,767] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'}
+[2023-03-30 23:26:19,295] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'}
+[2023-03-30 23:26:19,307] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'}
+[2023-03-30 23:26:19,323] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'}
+[2023-03-30 23:26:19,332] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'}
+[2023-03-30 23:26:19,342] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'}
+[2023-03-30 23:26:19,349] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'}
+[2023-03-30 23:26:19,373] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'}
+[2023-03-30 23:26:19,389] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'}
+[2023-03-30 23:26:20,046] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'}
+[2023-03-30 23:26:20,055] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'}
+[2023-03-30 23:26:20,067] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'}
+[2023-03-30 23:26:20,076] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'}
+[2023-03-30 23:26:20,094] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'}
+[2023-03-30 23:26:20,124] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'}
+[2023-03-30 23:26:20,135] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'}
+[2023-03-30 23:26:20,732] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'}
+[2023-03-30 23:26:20,742] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'}
+[2023-03-30 23:26:20,757] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'}
+[2023-03-30 23:26:20,770] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'}
+[2023-03-30 23:26:20,782] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'}
+[2023-03-30 23:26:20,798] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'}
+[2023-03-30 23:26:20,815] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'}
+[2023-03-30 23:26:20,834] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'}
+[2023-03-30 23:26:21,390] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'}
+[2023-03-30 23:26:21,405] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'}
+[2023-03-30 23:26:21,416] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'}
+[2023-03-30 23:26:21,428] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'}
+[2023-03-30 23:26:21,448] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'}
+[2023-03-30 23:26:21,459] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'}
+[2023-03-30 23:26:21,473] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'}
+[2023-03-30 23:26:22,065] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'}
+[2023-03-30 23:26:22,085] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'}
+[2023-03-30 23:26:22,110] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'}
+[2023-03-30 23:26:22,118] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'}
+[2023-03-30 23:26:22,137] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'}
+[2023-03-30 23:26:22,144] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'}
+[2023-03-30 23:26:22,154] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'}
+[2023-03-30 23:26:22,169] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'}
+[2023-03-30 23:26:22,698] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'}
+[2023-03-30 23:26:22,709] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'}
+[2023-03-30 23:26:22,731] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'}
+[2023-03-30 23:26:22,743] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'}
+[2023-03-30 23:26:22,755] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'}
+[2023-03-30 23:26:22,771] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'}
+[2023-03-30 23:26:22,782] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'}
+[2023-03-30 23:26:23,415] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'}
+[2023-03-30 23:26:23,430] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'}
+[2023-03-30 23:26:23,442] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'}
+[2023-03-30 23:26:23,456] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'}
+[2023-03-30 23:26:23,470] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'}
+[2023-03-30 23:26:23,487] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'}
+[2023-03-30 23:26:23,498] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'}
+[2023-03-30 23:26:23,524] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'}
+[2023-03-30 23:26:24,200] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'}
+[2023-03-30 23:26:24,210] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'}
+[2023-03-30 23:26:24,219] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'}
+[2023-03-30 23:26:24,231] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'}
+[2023-03-30 23:26:24,250] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'}
+[2023-03-30 23:26:24,262] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'}
+[2023-03-30 23:26:24,272] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'}
+[2023-03-30 23:26:24,898] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'}
+[2023-03-30 23:26:24,903] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'}
+[2023-03-30 23:26:24,907] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'}
+[2023-03-30 23:26:24,932] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'}
+[2023-03-30 23:26:24,957] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'}
+[2023-03-30 23:26:24,979] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'}
+[2023-03-30 23:26:24,991] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'}
+[2023-03-30 23:26:25,011] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'}
+[2023-03-30 23:26:25,616] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:25,625] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:25,648] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:25,658] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:25,669] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:25,681] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:25,690] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:25,707] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:26,378] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:26,384] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:26,389] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:26,397] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:26,402] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:26,415] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:26,428] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:27,008] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'}
+[2023-03-30 23:26:27,018] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'}
+[2023-03-30 23:26:27,026] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'}
+[2023-03-30 23:26:27,037] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'}
+[2023-03-30 23:26:27,046] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'}
+[2023-03-30 23:26:27,054] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'}
+[2023-03-30 23:26:27,062] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'}
+[2023-03-30 23:26:27,070] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'}
+[2023-03-30 23:26:27,735] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'}
+[2023-03-30 23:26:27,745] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'}
+[2023-03-30 23:26:27,755] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'}
+[2023-03-30 23:26:27,769] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'}
+[2023-03-30 23:26:27,783] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'}
+[2023-03-30 23:26:27,794] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'}
+[2023-03-30 23:26:27,804] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'}
+[2023-03-30 23:26:28,454] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'}
+[2023-03-30 23:26:28,472] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'}
+[2023-03-30 23:26:28,481] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'}
+[2023-03-30 23:26:28,489] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'}
+[2023-03-30 23:26:28,499] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'}
+[2023-03-30 23:26:28,533] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'}
+[2023-03-30 23:26:28,543] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'}
+[2023-03-30 23:26:28,556] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'}
+[2023-03-30 23:26:29,212] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'}
+[2023-03-30 23:26:29,222] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'}
+[2023-03-30 23:26:29,233] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'}
+[2023-03-30 23:26:29,246] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'}
+[2023-03-30 23:26:29,258] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'}
+[2023-03-30 23:26:29,270] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'}
+[2023-03-30 23:26:29,286] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'}
+[2023-03-30 23:26:30,003] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'}
+[2023-03-30 23:26:30,013] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'}
+[2023-03-30 23:26:30,038] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'}
+[2023-03-30 23:26:30,048] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'}
+[2023-03-30 23:26:30,062] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'}
+[2023-03-30 23:26:30,074] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'}
+[2023-03-30 23:26:30,114] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'}
+[2023-03-30 23:26:30,125] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'}
+[2023-03-30 23:26:30,856] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'}
+[2023-03-30 23:26:30,876] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'}
+[2023-03-30 23:26:30,885] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'}
+[2023-03-30 23:26:30,897] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'}
+[2023-03-30 23:26:30,914] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'}
+[2023-03-30 23:26:30,940] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'}
+[2023-03-30 23:26:30,952] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'}
+[2023-03-30 23:26:31,655] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'}
+[2023-03-30 23:26:31,696] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'}
+[2023-03-30 23:26:31,709] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'}
+[2023-03-30 23:26:31,718] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'}
+[2023-03-30 23:26:31,727] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'}
+[2023-03-30 23:26:31,740] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'}
+[2023-03-30 23:26:31,757] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'}
+[2023-03-30 23:26:31,768] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'}
+[2023-03-30 23:26:32,476] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'}
+[2023-03-30 23:26:32,486] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'}
+[2023-03-30 23:26:32,495] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'}
+[2023-03-30 23:26:32,549] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'}
+[2023-03-30 23:26:32,560] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'}
+[2023-03-30 23:26:32,574] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'}
+[2023-03-30 23:26:32,590] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'}
+[2023-03-30 23:26:33,338] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'}
+[2023-03-30 23:26:33,356] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'}
+[2023-03-30 23:26:33,368] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'}
+[2023-03-30 23:26:33,386] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'}
+[2023-03-30 23:26:33,397] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'}
+[2023-03-30 23:26:33,409] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'}
+[2023-03-30 23:26:33,424] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'}
+[2023-03-30 23:26:33,434] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'}
+[2023-03-30 23:26:34,352] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'}
+[2023-03-30 23:26:34,364] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'}
+[2023-03-30 23:26:34,377] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'}
+[2023-03-30 23:26:34,395] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'}
+[2023-03-30 23:26:34,410] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'}
+[2023-03-30 23:26:34,423] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'}
+[2023-03-30 23:26:34,434] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'}
+[2023-03-30 23:26:35,373] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有'}
+[2023-03-30 23:26:35,397] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有'}
+[2023-03-30 23:26:35,410] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有'}
+[2023-03-30 23:26:35,420] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有'}
+[2023-03-30 23:26:35,437] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有'}
+[2023-03-30 23:26:35,448] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有'}
+[2023-03-30 23:26:35,460] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有'}
+[2023-03-30 23:26:35,473] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有'}
+[2023-03-30 23:26:36,288] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的'}
+[2023-03-30 23:26:36,297] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的'}
+[2023-03-30 23:26:36,306] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的'}
+[2023-03-30 23:26:36,326] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的'}
+[2023-03-30 23:26:36,336] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的'}
+[2023-03-30 23:26:36,351] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的'}
+[2023-03-30 23:26:36,365] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的'}
+[2023-03-30 23:26:37,164] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象'}
+[2023-03-30 23:26:37,173] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象'}
+[2023-03-30 23:26:37,182] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象'}
+[2023-03-30 23:26:37,192] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象'}
+[2023-03-30 23:26:37,204] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象'}
+[2023-03-30 23:26:37,232] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象'}
+[2023-03-30 23:26:37,238] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象'}
+[2023-03-30 23:26:37,252] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象'}
+[2023-03-30 23:26:38,084] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后'}
+[2023-03-30 23:26:38,093] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后'}
+[2023-03-30 23:26:38,106] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后'}
+[2023-03-30 23:26:38,122] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后'}
+[2023-03-30 23:26:38,140] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后'}
+[2023-03-30 23:26:38,181] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后'}
+[2023-03-30 23:26:38,206] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后'}
+[2023-03-30 23:26:39,094] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合'}
+[2023-03-30 23:26:39,111] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合'}
+[2023-03-30 23:26:39,132] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合'}
+[2023-03-30 23:26:39,150] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合'}
+[2023-03-30 23:26:39,174] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合'}
+[2023-03-30 23:26:39,190] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合'}
+[2023-03-30 23:26:39,197] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合'}
+[2023-03-30 23:26:39,212] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合'}
+[2023-03-30 23:26:40,009] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实'}
+[2023-03-30 23:26:40,094] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实'}
+[2023-03-30 23:26:40,105] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实'}
+[2023-03-30 23:26:40,128] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实'}
+[2023-03-30 23:26:40,149] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实'}
+[2023-03-30 23:26:40,173] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实'}
+[2023-03-30 23:26:40,189] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实'}
+[2023-03-30 23:26:40,200] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实'}
+[2023-03-30 23:26:40,952] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用'}
+[2023-03-30 23:26:40,973] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用'}
+[2023-03-30 23:26:40,986] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用'}
+[2023-03-30 23:26:40,999] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用'}
+[2023-03-30 23:26:41,013] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用'}
+[2023-03-30 23:26:41,022] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用'}
+[2023-03-30 23:26:41,033] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用'}
+[2023-03-30 23:26:41,819] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升'}
+[2023-03-30 23:26:41,832] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升'}
+[2023-03-30 23:26:41,845] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升'}
+[2023-03-30 23:26:41,878] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升'}
+[2023-03-30 23:26:41,886] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升'}
+[2023-03-30 23:26:41,893] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升'}
+[2023-03-30 23:26:41,925] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升'}
+[2023-03-30 23:26:41,935] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升'}
+[2023-03-30 23:26:42,562] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对'}
+[2023-03-30 23:26:42,589] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对'}
+[2023-03-30 23:26:42,621] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对'}
+[2023-03-30 23:26:42,634] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对'}
+[2023-03-30 23:26:42,644] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对'}
+[2023-03-30 23:26:42,657] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对'}
+[2023-03-30 23:26:42,668] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对'}
+[2023-03-30 23:26:43,380] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴'}
+[2023-03-30 23:26:43,389] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴'}
+[2023-03-30 23:26:43,436] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴'}
+[2023-03-30 23:26:43,448] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴'}
+[2023-03-30 23:26:43,462] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴'}
+[2023-03-30 23:26:43,472] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴'}
+[2023-03-30 23:26:43,486] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴'}
+[2023-03-30 23:26:43,496] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴'}
+[2023-03-30 23:26:44,346] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以'}
+[2023-03-30 23:26:44,356] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以'}
+[2023-03-30 23:26:44,364] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以'}
+[2023-03-30 23:26:44,374] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以'}
+[2023-03-30 23:26:44,389] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以'}
+[2023-03-30 23:26:44,398] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以'}
+[2023-03-30 23:26:44,420] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以'}
+[2023-03-30 23:26:45,226] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以及意义感'}
+[2023-03-30 23:26:45,235] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以及意义感'}
+[2023-03-30 23:26:45,258] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以及意义感'}
+[2023-03-30 23:26:45,273] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以及意义感'}
+[2023-03-30 23:26:45,295] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以及意义感'}
+[2023-03-30 23:26:45,306] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以及意义感'}
+[2023-03-30 23:26:46,380] [ INFO] - client punctuation restored msg={'result': '第一部分是认知部分,该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理,让学生对设备有大致的认知。随后使用真实传感器的内部构造图,辅以文字说明,进一步帮助学生对传感器有更深刻的印象,最后结合具体的实践应用,提升学生对实训的兴趣以及意义感。'}
+[2023-03-30 23:27:01,059] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '第一部分是认知部分,该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理,让学生对设备有大致的认知。随后使用真实传感器的内部构造图,辅以文字说明,进一步帮助学生对传感器有更深刻的印象,最后结合具体的实践应用,提升学生对实训的兴趣以及意义感。', 'times': [{'w': '第', 'bg': 0.0, 'ed': 0.36}, {'w': '一', 'bg': 0.36, 'ed': 0.48}, {'w': '部', 'bg': 0.48, 'ed': 0.62}, {'w': '分', 'bg': 0.62, 'ed': 0.8200000000000001}, {'w': '是', 'bg': 0.8200000000000001, 'ed': 1.08}, {'w': '认', 'bg': 1.08, 'ed': 1.28}, {'w': '知', 'bg': 1.28, 'ed': 1.44}, {'w': '部', 'bg': 1.44, 'ed': 1.58}, {'w': '分', 'bg': 1.58, 'ed': 2.1}, {'w': '该', 'bg': 2.1, 'ed': 2.6}, {'w': '部', 'bg': 2.6, 'ed': 2.72}, {'w': '分', 'bg': 2.72, 'ed': 2.94}, {'w': '通', 'bg': 2.94, 'ed': 3.16}, {'w': '过', 'bg': 3.16, 'ed': 3.36}, {'w': '示', 'bg': 3.36, 'ed': 3.54}, {'w': '意', 'bg': 3.54, 'ed': 3.68}, {'w': '图', 'bg': 3.68, 'ed': 3.9}, {'w': '和', 'bg': 3.9, 'ed': 4.14}, {'w': '文', 'bg': 4.14, 'ed': 4.32}, {'w': '本', 'bg': 4.32, 'ed': 4.46}, {'w': '的', 'bg': 4.46, 'ed': 4.58}, {'w': '形', 'bg': 4.58, 'ed': 4.72}, {'w': '式', 'bg': 4.72, 'ed': 5.0}, {'w': '向', 'bg': 5.0, 'ed': 5.32}, {'w': '学', 'bg': 5.32, 'ed': 5.5}, {'w': '生', 'bg': 5.5, 'ed': 5.66}, {'w': '讲', 'bg': 5.66, 'ed': 5.86}, {'w': '解', 'bg': 5.86, 'ed': 6.18}, {'w': '主', 'bg': 6.18, 'ed': 6.46}, {'w': '要', 'bg': 6.46, 'ed': 6.62}, {'w': '传', 'bg': 6.62, 'ed': 6.8}, {'w': '感', 'bg': 6.8, 'ed': 7.0}, {'w': '器', 'bg': 7.0, 'ed': 7.16}, {'w': '的', 'bg': 7.16, 'ed': 7.28}, {'w': '工', 'bg': 7.28, 'ed': 7.44}, {'w': '作', 'bg': 7.44, 'ed': 7.6000000000000005}, {'w': '原', 'bg': 7.6000000000000005, 'ed': 7.74}, {'w': '理', 'bg': 7.74, 'ed': 8.06}, {'w': '让', 'bg': 8.06, 'ed': 8.44}, {'w': '学', 'bg': 8.44, 'ed': 8.64}, {'w': '生', 'bg': 8.64, 'ed': 8.84}, {'w': '对', 'bg': 8.84, 'ed': 9.06}, {'w': '设', 'bg': 9.06, 'ed': 9.24}, {'w': '备', 'bg': 9.24, 'ed': 9.52}, {'w': '有', 'bg': 9.52, 'ed': 9.86}, {'w': '大', 'bg': 9.86, 'ed': 10.1}, {'w': '致', 'bg': 10.1, 'ed': 10.24}, {'w': '的', 'bg': 10.24, 'ed': 10.36}, {'w': '认', 'bg': 10.36, 'ed': 10.5}, {'w': '知', 'bg': 10.5, 'ed': 11.040000000000001}, {'w': '随', 'bg': 11.040000000000001, 'ed': 11.56}, {'w': '后', 'bg': 11.56, 'ed': 11.82}, {'w': '使', 'bg': 11.82, 'ed': 12.1}, {'w': '用', 'bg': 12.1, 'ed': 12.26}, {'w': '真', 'bg': 12.26, 'ed': 12.44}, {'w': '实', 'bg': 12.44, 'ed': 12.620000000000001}, {'w': '传', 'bg': 12.620000000000001, 'ed': 12.780000000000001}, {'w': '感', 'bg': 12.780000000000001, 'ed': 12.94}, {'w': '器', 'bg': 12.94, 'ed': 13.1}, {'w': '的', 'bg': 13.1, 'ed': 13.26}, {'w': '内', 'bg': 13.26, 'ed': 13.42}, {'w': '部', 'bg': 13.42, 'ed': 13.56}, {'w': '构', 'bg': 13.56, 'ed': 13.700000000000001}, {'w': '造', 'bg': 13.700000000000001, 'ed': 13.86}, {'w': '图', 'bg': 13.86, 'ed': 14.280000000000001}, {'w': '辅', 'bg': 14.280000000000001, 'ed': 14.66}, {'w': '以', 'bg': 14.66, 'ed': 14.82}, {'w': '文', 'bg': 14.82, 'ed': 15.0}, {'w': '字', 'bg': 15.0, 'ed': 15.16}, {'w': '说', 'bg': 15.16, 'ed': 15.32}, {'w': '明', 'bg': 15.32, 'ed': 15.72}, {'w': '进', 'bg': 15.72, 'ed': 16.1}, {'w': '一', 'bg': 16.1, 'ed': 16.2}, {'w': '步', 'bg': 16.2, 'ed': 16.32}, {'w': '帮', 'bg': 16.32, 'ed': 16.48}, {'w': '助', 'bg': 16.48, 'ed': 16.66}, {'w': '学', 'bg': 16.66, 'ed': 16.82}, {'w': '生', 'bg': 16.82, 'ed': 17.12}, {'w': '对', 'bg': 17.12, 'ed': 17.48}, {'w': '传', 'bg': 17.48, 'ed': 17.66}, {'w': '感', 'bg': 17.66, 'ed': 17.84}, {'w': '器', 'bg': 17.84, 'ed': 18.12}, {'w': '有', 'bg': 18.12, 'ed': 18.42}, {'w': '更', 'bg': 18.42, 'ed': 18.66}, {'w': '深', 'bg': 18.66, 'ed': 18.88}, {'w': '刻', 
'bg': 18.88, 'ed': 19.04}, {'w': '的', 'bg': 19.04, 'ed': 19.16}, {'w': '印', 'bg': 19.16, 'ed': 19.3}, {'w': '象', 'bg': 19.3, 'ed': 19.8}, {'w': '最', 'bg': 19.8, 'ed': 20.3}, {'w': '后', 'bg': 20.3, 'ed': 20.62}, {'w': '结', 'bg': 20.62, 'ed': 20.96}, {'w': '合', 'bg': 20.96, 'ed': 21.14}, {'w': '具', 'bg': 21.14, 'ed': 21.3}, {'w': '体', 'bg': 21.3, 'ed': 21.42}, {'w': '的', 'bg': 21.42, 'ed': 21.580000000000002}, {'w': '实', 'bg': 21.580000000000002, 'ed': 21.76}, {'w': '践', 'bg': 21.76, 'ed': 21.92}, {'w': '应', 'bg': 21.92, 'ed': 22.080000000000002}, {'w': '用', 'bg': 22.080000000000002, 'ed': 22.44}, {'w': '提', 'bg': 22.44, 'ed': 22.78}, {'w': '升', 'bg': 22.78, 'ed': 22.94}, {'w': '学', 'bg': 22.94, 'ed': 23.12}, {'w': '生', 'bg': 23.12, 'ed': 23.34}, {'w': '对', 'bg': 23.34, 'ed': 23.62}, {'w': '实', 'bg': 23.62, 'ed': 23.82}, {'w': '训', 'bg': 23.82, 'ed': 23.96}, {'w': '的', 'bg': 23.96, 'ed': 24.12}, {'w': '兴', 'bg': 24.12, 'ed': 24.3}, {'w': '趣', 'bg': 24.3, 'ed': 24.6}, {'w': '以', 'bg': 24.6, 'ed': 24.88}, {'w': '及', 'bg': 24.88, 'ed': 25.12}, {'w': '意', 'bg': 25.12, 'ed': 25.34}, {'w': '义', 'bg': 25.34, 'ed': 25.46}, {'w': '感', 'bg': 25.46, 'ed': 26.04}]}
+[2023-03-30 23:27:01,060] [ INFO] - audio duration: 26.04, elapsed time: 46.581613540649414, RTF=1.7888484462614982
+sentences: ['第一部分是认知部分', '该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理', '让学生对设备有大致的认知', '随后使用真实传感器的内部构造图', '辅以文字说明', '进一步帮助学生对传感器有更深刻的印象', '最后结合具体的实践应用', '提升学生对实训的兴趣以及意义感']
+relative_times: [[0.0, 2.1], [2.1, 8.06], [8.06, 11.040000000000001], [11.040000000000001, 14.280000000000001], [14.280000000000001, 15.72], [15.72, 19.8], [19.8, 22.44], [22.44, 26.04]]
+[2023-03-30 23:27:01,076] [ INFO] - results saved to /home/fxb/PaddleSpeech-develop/data/认知.srt
+ ```
diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md
index 1902a2fa..bbddd693 100644
--- a/demos/streaming_asr_server/README_cn.md
+++ b/demos/streaming_asr_server/README_cn.md
@@ -90,7 +90,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
server_executor = ServerExecutor()
server_executor(
- config_file="./conf/ws_conformer_wenetspeech_application",
+ config_file="./conf/ws_conformer_wenetspeech_application_faster.yaml",
log_file="./log/paddlespeech.log")
```
@@ -578,3 +578,354 @@ bash server.sh
[2022-05-07 11:11:18,915] [ INFO] - audio duration: 4.9968125, elapsed time: 15.928460597991943, RTF=3.187724293835709
[2022-05-07 11:11:18,916] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康
```
+
+## Generate subtitle files (.srt format) from audio files (.wav or .mp3 format)
+
+**Note:** Both services are deployed on the `cpu` device by default. Speech recognition and punctuation prediction can be placed on different `gpu` devices by modifying the `device` parameter in their server configuration files.
+
+Use the two services `streaming_asr_server.py` and `punc_server.py` to start the streaming speech recognition service and the punctuation prediction service respectively. The `local/websocket_client_srt.py` script then calls both services and generates the corresponding subtitle file (.srt format); a sketch of the final SRT-writing step is given after these notes.
+
+**ffmpeg must be installed before using this script.**
+
+**The following scripts should be run from the corresponding `.../demos/streaming_asr_server/` directory.**
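+
+The client receives the final recognition result with word-level timestamps, has the punctuation service restore punctuation, splits the text into sentences together with their begin/end times (the `sentences` and `relative_times` printed at the end of the example output), and finally writes the `.srt` file. The snippet below is a minimal, illustrative sketch of that last step only; the helper names are hypothetical and not the actual code in `local/websocket_client_srt.py`.
+
+```python
+# Minimal sketch (hypothetical helpers, not the shipped implementation):
+# turn punctuated sentences and their [begin, end] times (in seconds) into an SRT file.
+def to_srt_time(seconds: float) -> str:
+    # SRT timestamps use the form HH:MM:SS,mmm
+    ms = int(round(seconds * 1000))
+    h, ms = divmod(ms, 3_600_000)
+    m, ms = divmod(ms, 60_000)
+    s, ms = divmod(ms, 1_000)
+    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
+
+def write_srt(sentences, relative_times, path):
+    # sentences: list of sentence strings; relative_times: list of [begin, end] pairs
+    with open(path, "w", encoding="utf-8") as f:
+        for idx, (text, (bg, ed)) in enumerate(zip(sentences, relative_times), 1):
+            f.write(f"{idx}\n{to_srt_time(bg)} --> {to_srt_time(ed)}\n{text}\n\n")
+```
+
+Called with the `sentences` and `relative_times` values printed at the end of the example log, `write_srt(sentences, relative_times, "认知.srt")` would produce a subtitle file like the one reported in the last log line.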
+
+### 1. Start the server
+
+```bash
+# Note: streaming speech recognition and punctuation prediction can be assigned to different GPUs through their configuration files
+paddlespeech_server start --config_file ./conf/ws_conformer_wenetspeech_application.yaml
+```
+
+Open another terminal and run the following command:
+```bash
+paddlespeech_server start --config_file conf/punc_application.yaml
+```
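+
+As an alternative to the two CLI commands above, the servers can also be started from Python. The sketch below is based on the `ServerExecutor` usage shown earlier in this README; the import path is assumed, and each call blocks, so run each server in its own terminal or process.
+
+```python
+# Hedged sketch: start the servers via the Python API instead of the CLI.
+# Import path assumed from the Python usage section earlier in this README.
+from paddlespeech.server.bin.paddlespeech_server import ServerExecutor
+
+# Terminal/process 1: streaming ASR server (this call blocks)
+asr_server = ServerExecutor()
+asr_server(
+    config_file="./conf/ws_conformer_wenetspeech_application.yaml",
+    log_file="./log/paddlespeech.log")
+
+# Terminal/process 2: punctuation prediction server
+# punc_server = ServerExecutor()
+# punc_server(
+#     config_file="./conf/punc_application.yaml",
+#     log_file="./log/paddlespeech_punc.log")  # log path chosen for illustration
+```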
+
+### 2. Start the client
+
+```bash
+python3 local/websocket_client_srt.py --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ../../data/认知.mp3
+```
+Output:
+```text
+[2023-03-30 23:26:13,991] [ INFO] - Start to do streaming asr client
+[2023-03-30 23:26:13,994] [ INFO] - asr websocket client start
+[2023-03-30 23:26:13,994] [ INFO] - endpoint: http://127.0.0.1:8190/paddlespeech/text
+[2023-03-30 23:26:13,994] [ INFO] - endpoint: ws://127.0.0.1:8090/paddlespeech/asr/streaming
+[2023-03-30 23:26:14,475] [ INFO] - /home/fxb/PaddleSpeech-develop/data/认知.mp3 converted to /home/fxb/PaddleSpeech-develop/data/认知.wav
+[2023-03-30 23:26:14,476] [ INFO] - start to process the wavscp: /home/fxb/PaddleSpeech-develop/data/认知.wav
+[2023-03-30 23:26:14,515] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"}
+[2023-03-30 23:26:14,533] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,545] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,556] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,572] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,588] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,600] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,613] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,626] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:15,122] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,135] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,154] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,163] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,175] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,185] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,196] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,637] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,648] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,657] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,666] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,676] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,683] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,691] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,703] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:16,146] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,159] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,167] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,177] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,187] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,197] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,210] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,694] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,704] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,713] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,725] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,737] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,749] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,759] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,770] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:17,279] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'}
+[2023-03-30 23:26:17,302] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'}
+[2023-03-30 23:26:17,316] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'}
+[2023-03-30 23:26:17,332] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'}
+[2023-03-30 23:26:17,343] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'}
+[2023-03-30 23:26:17,358] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'}
+[2023-03-30 23:26:17,373] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'}
+[2023-03-30 23:26:17,958] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:17,971] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:17,987] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:18,000] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:18,017] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:18,028] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:18,038] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:18,049] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:18,653] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'}
+[2023-03-30 23:26:18,689] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'}
+[2023-03-30 23:26:18,701] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'}
+[2023-03-30 23:26:18,712] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'}
+[2023-03-30 23:26:18,723] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'}
+[2023-03-30 23:26:18,750] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'}
+[2023-03-30 23:26:18,767] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'}
+[2023-03-30 23:26:19,295] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'}
+[2023-03-30 23:26:19,307] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'}
+[2023-03-30 23:26:19,323] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'}
+[2023-03-30 23:26:19,332] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'}
+[2023-03-30 23:26:19,342] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'}
+[2023-03-30 23:26:19,349] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'}
+[2023-03-30 23:26:19,373] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'}
+[2023-03-30 23:26:19,389] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'}
+[2023-03-30 23:26:20,046] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'}
+[2023-03-30 23:26:20,055] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'}
+[2023-03-30 23:26:20,067] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'}
+[2023-03-30 23:26:20,076] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'}
+[2023-03-30 23:26:20,094] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'}
+[2023-03-30 23:26:20,124] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'}
+[2023-03-30 23:26:20,135] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'}
+[2023-03-30 23:26:20,732] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'}
+[2023-03-30 23:26:20,742] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'}
+[2023-03-30 23:26:20,757] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'}
+[2023-03-30 23:26:20,770] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'}
+[2023-03-30 23:26:20,782] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'}
+[2023-03-30 23:26:20,798] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'}
+[2023-03-30 23:26:20,815] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'}
+[2023-03-30 23:26:20,834] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'}
+[2023-03-30 23:26:21,390] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'}
+[2023-03-30 23:26:21,405] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'}
+[2023-03-30 23:26:21,416] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'}
+[2023-03-30 23:26:21,428] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'}
+[2023-03-30 23:26:21,448] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'}
+[2023-03-30 23:26:21,459] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'}
+[2023-03-30 23:26:21,473] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'}
+[2023-03-30 23:26:22,065] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'}
+[2023-03-30 23:26:22,085] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'}
+[2023-03-30 23:26:22,110] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'}
+[2023-03-30 23:26:22,118] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'}
+[2023-03-30 23:26:22,137] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'}
+[2023-03-30 23:26:22,144] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'}
+[2023-03-30 23:26:22,154] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'}
+[2023-03-30 23:26:22,169] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'}
+[2023-03-30 23:26:22,698] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'}
+[2023-03-30 23:26:22,709] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'}
+[2023-03-30 23:26:22,731] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'}
+[2023-03-30 23:26:22,743] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'}
+[2023-03-30 23:26:22,755] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'}
+[2023-03-30 23:26:22,771] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'}
+[2023-03-30 23:26:22,782] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'}
+[2023-03-30 23:26:23,415] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'}
+[2023-03-30 23:26:23,430] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'}
+[2023-03-30 23:26:23,442] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'}
+[2023-03-30 23:26:23,456] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'}
+[2023-03-30 23:26:23,470] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'}
+[2023-03-30 23:26:23,487] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'}
+[2023-03-30 23:26:23,498] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'}
+[2023-03-30 23:26:23,524] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'}
+[2023-03-30 23:26:24,200] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'}
+[2023-03-30 23:26:24,210] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'}
+[2023-03-30 23:26:24,219] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'}
+[2023-03-30 23:26:24,231] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'}
+[2023-03-30 23:26:24,250] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'}
+[2023-03-30 23:26:24,262] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'}
+[2023-03-30 23:26:24,272] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'}
+[2023-03-30 23:26:24,898] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'}
+[2023-03-30 23:26:24,903] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'}
+[2023-03-30 23:26:24,907] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'}
+[2023-03-30 23:26:24,932] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'}
+[2023-03-30 23:26:24,957] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'}
+[2023-03-30 23:26:24,979] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'}
+[2023-03-30 23:26:24,991] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'}
+[2023-03-30 23:26:25,011] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'}
+[2023-03-30 23:26:25,616] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:25,625] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:25,648] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:25,658] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:25,669] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:25,681] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:25,690] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:25,707] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:26,378] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:26,384] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:26,389] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:26,397] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:26,402] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:26,415] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:26,428] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'}
+[2023-03-30 23:26:27,008] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'}
+[2023-03-30 23:26:27,018] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'}
+[2023-03-30 23:26:27,026] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'}
+[2023-03-30 23:26:27,037] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'}
+[2023-03-30 23:26:27,046] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'}
+[2023-03-30 23:26:27,054] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'}
+[2023-03-30 23:26:27,062] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'}
+[2023-03-30 23:26:27,070] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'}
+[2023-03-30 23:26:27,735] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'}
+[2023-03-30 23:26:27,745] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'}
+[2023-03-30 23:26:27,755] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'}
+[2023-03-30 23:26:27,769] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'}
+[2023-03-30 23:26:27,783] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'}
+[2023-03-30 23:26:27,794] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'}
+[2023-03-30 23:26:27,804] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'}
+[2023-03-30 23:26:28,454] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'}
+[2023-03-30 23:26:28,472] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'}
+[2023-03-30 23:26:28,481] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'}
+[2023-03-30 23:26:28,489] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'}
+[2023-03-30 23:26:28,499] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'}
+[2023-03-30 23:26:28,533] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'}
+[2023-03-30 23:26:28,543] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'}
+[2023-03-30 23:26:28,556] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'}
+[2023-03-30 23:26:29,212] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'}
+[2023-03-30 23:26:29,222] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'}
+[2023-03-30 23:26:29,233] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'}
+[2023-03-30 23:26:29,246] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'}
+[2023-03-30 23:26:29,258] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'}
+[2023-03-30 23:26:29,270] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'}
+[2023-03-30 23:26:29,286] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'}
+[2023-03-30 23:26:30,003] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'}
+[2023-03-30 23:26:30,013] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'}
+[2023-03-30 23:26:30,038] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'}
+[2023-03-30 23:26:30,048] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'}
+[2023-03-30 23:26:30,062] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'}
+[2023-03-30 23:26:30,074] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'}
+[2023-03-30 23:26:30,114] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'}
+[2023-03-30 23:26:30,125] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'}
+[2023-03-30 23:26:30,856] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'}
+[2023-03-30 23:26:30,876] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'}
+[2023-03-30 23:26:30,885] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'}
+[2023-03-30 23:26:30,897] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'}
+[2023-03-30 23:26:30,914] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'}
+[2023-03-30 23:26:30,940] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'}
+[2023-03-30 23:26:30,952] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'}
+[2023-03-30 23:26:31,655] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'}
+[2023-03-30 23:26:31,696] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'}
+[2023-03-30 23:26:31,709] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'}
+[2023-03-30 23:26:31,718] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'}
+[2023-03-30 23:26:31,727] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'}
+[2023-03-30 23:26:31,740] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'}
+[2023-03-30 23:26:31,757] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'}
+[2023-03-30 23:26:31,768] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'}
+[2023-03-30 23:26:32,476] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'}
+[2023-03-30 23:26:32,486] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'}
+[2023-03-30 23:26:32,495] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'}
+[2023-03-30 23:26:32,549] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'}
+[2023-03-30 23:26:32,560] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'}
+[2023-03-30 23:26:32,574] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'}
+[2023-03-30 23:26:32,590] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'}
+[2023-03-30 23:26:33,338] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'}
+[2023-03-30 23:26:33,356] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'}
+[2023-03-30 23:26:33,368] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'}
+[2023-03-30 23:26:33,386] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'}
+[2023-03-30 23:26:33,397] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'}
+[2023-03-30 23:26:33,409] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'}
+[2023-03-30 23:26:33,424] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'}
+[2023-03-30 23:26:33,434] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'}
+[2023-03-30 23:26:34,352] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'}
+[2023-03-30 23:26:34,364] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'}
+[2023-03-30 23:26:34,377] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'}
+[2023-03-30 23:26:34,395] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'}
+[2023-03-30 23:26:34,410] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'}
+[2023-03-30 23:26:34,423] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'}
+[2023-03-30 23:26:34,434] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'}
+[2023-03-30 23:26:35,373] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有'}
+[2023-03-30 23:26:35,397] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有'}
+[2023-03-30 23:26:35,410] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有'}
+[2023-03-30 23:26:35,420] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有'}
+[2023-03-30 23:26:35,437] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有'}
+[2023-03-30 23:26:35,448] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有'}
+[2023-03-30 23:26:35,460] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有'}
+[2023-03-30 23:26:35,473] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有'}
+[2023-03-30 23:26:36,288] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的'}
+[2023-03-30 23:26:36,297] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的'}
+[2023-03-30 23:26:36,306] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的'}
+[2023-03-30 23:26:36,326] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的'}
+[2023-03-30 23:26:36,336] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的'}
+[2023-03-30 23:26:36,351] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的'}
+[2023-03-30 23:26:36,365] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的'}
+[2023-03-30 23:26:37,164] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象'}
+[2023-03-30 23:26:37,173] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象'}
+[2023-03-30 23:26:37,182] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象'}
+[2023-03-30 23:26:37,192] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象'}
+[2023-03-30 23:26:37,204] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象'}
+[2023-03-30 23:26:37,232] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象'}
+[2023-03-30 23:26:37,238] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象'}
+[2023-03-30 23:26:37,252] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象'}
+[2023-03-30 23:26:38,084] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后'}
+[2023-03-30 23:26:38,093] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后'}
+[2023-03-30 23:26:38,106] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后'}
+[2023-03-30 23:26:38,122] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后'}
+[2023-03-30 23:26:38,140] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后'}
+[2023-03-30 23:26:38,181] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后'}
+[2023-03-30 23:26:38,206] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后'}
+[2023-03-30 23:26:39,094] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合'}
+[2023-03-30 23:26:39,111] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合'}
+[2023-03-30 23:26:39,132] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合'}
+[2023-03-30 23:26:39,150] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合'}
+[2023-03-30 23:26:39,174] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合'}
+[2023-03-30 23:26:39,190] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合'}
+[2023-03-30 23:26:39,197] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合'}
+[2023-03-30 23:26:39,212] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合'}
+[2023-03-30 23:26:40,009] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实'}
+[2023-03-30 23:26:40,094] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实'}
+[2023-03-30 23:26:40,105] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实'}
+[2023-03-30 23:26:40,128] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实'}
+[2023-03-30 23:26:40,149] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实'}
+[2023-03-30 23:26:40,173] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实'}
+[2023-03-30 23:26:40,189] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实'}
+[2023-03-30 23:26:40,200] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实'}
+[2023-03-30 23:26:40,952] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用'}
+[2023-03-30 23:26:40,973] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用'}
+[2023-03-30 23:26:40,986] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用'}
+[2023-03-30 23:26:40,999] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用'}
+[2023-03-30 23:26:41,013] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用'}
+[2023-03-30 23:26:41,022] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用'}
+[2023-03-30 23:26:41,033] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用'}
+[2023-03-30 23:26:41,819] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升'}
+[2023-03-30 23:26:41,832] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升'}
+[2023-03-30 23:26:41,845] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升'}
+[2023-03-30 23:26:41,878] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升'}
+[2023-03-30 23:26:41,886] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升'}
+[2023-03-30 23:26:41,893] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升'}
+[2023-03-30 23:26:41,925] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升'}
+[2023-03-30 23:26:41,935] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升'}
+[2023-03-30 23:26:42,562] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对'}
+[2023-03-30 23:26:42,589] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对'}
+[2023-03-30 23:26:42,621] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对'}
+[2023-03-30 23:26:42,634] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对'}
+[2023-03-30 23:26:42,644] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对'}
+[2023-03-30 23:26:42,657] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对'}
+[2023-03-30 23:26:42,668] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对'}
+[2023-03-30 23:26:43,380] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴'}
+[2023-03-30 23:26:43,389] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴'}
+[2023-03-30 23:26:43,436] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴'}
+[2023-03-30 23:26:43,448] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴'}
+[2023-03-30 23:26:43,462] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴'}
+[2023-03-30 23:26:43,472] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴'}
+[2023-03-30 23:26:43,486] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴'}
+[2023-03-30 23:26:43,496] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴'}
+[2023-03-30 23:26:44,346] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以'}
+[2023-03-30 23:26:44,356] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以'}
+[2023-03-30 23:26:44,364] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以'}
+[2023-03-30 23:26:44,374] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以'}
+[2023-03-30 23:26:44,389] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以'}
+[2023-03-30 23:26:44,398] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以'}
+[2023-03-30 23:26:44,420] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以'}
+[2023-03-30 23:26:45,226] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以及意义感'}
+[2023-03-30 23:26:45,235] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以及意义感'}
+[2023-03-30 23:26:45,258] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以及意义感'}
+[2023-03-30 23:26:45,273] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以及意义感'}
+[2023-03-30 23:26:45,295] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以及意义感'}
+[2023-03-30 23:26:45,306] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以及意义感'}
+[2023-03-30 23:26:46,380] [ INFO] - client punctuation restored msg={'result': '第一部分是认知部分,该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理,让学生对设备有大致的认知。随后使用真实传感器的内部构造图,辅以文字说明,进一步帮助学生对传感器有更深刻的印象,最后结合具体的实践应用,提升学生对实训的兴趣以及意义感。'}
+[2023-03-30 23:27:01,059] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '第一部分是认知部分,该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理,让学生对设备有大致的认知。随后使用真实传感器的内部构造图,辅以文字说明,进一步帮助学生对传感器有更深刻的印象,最后结合具体的实践应用,提升学生对实训的兴趣以及意义感。', 'times': [{'w': '第', 'bg': 0.0, 'ed': 0.36}, {'w': '一', 'bg': 0.36, 'ed': 0.48}, {'w': '部', 'bg': 0.48, 'ed': 0.62}, {'w': '分', 'bg': 0.62, 'ed': 0.8200000000000001}, {'w': '是', 'bg': 0.8200000000000001, 'ed': 1.08}, {'w': '认', 'bg': 1.08, 'ed': 1.28}, {'w': '知', 'bg': 1.28, 'ed': 1.44}, {'w': '部', 'bg': 1.44, 'ed': 1.58}, {'w': '分', 'bg': 1.58, 'ed': 2.1}, {'w': '该', 'bg': 2.1, 'ed': 2.6}, {'w': '部', 'bg': 2.6, 'ed': 2.72}, {'w': '分', 'bg': 2.72, 'ed': 2.94}, {'w': '通', 'bg': 2.94, 'ed': 3.16}, {'w': '过', 'bg': 3.16, 'ed': 3.36}, {'w': '示', 'bg': 3.36, 'ed': 3.54}, {'w': '意', 'bg': 3.54, 'ed': 3.68}, {'w': '图', 'bg': 3.68, 'ed': 3.9}, {'w': '和', 'bg': 3.9, 'ed': 4.14}, {'w': '文', 'bg': 4.14, 'ed': 4.32}, {'w': '本', 'bg': 4.32, 'ed': 4.46}, {'w': '的', 'bg': 4.46, 'ed': 4.58}, {'w': '形', 'bg': 4.58, 'ed': 4.72}, {'w': '式', 'bg': 4.72, 'ed': 5.0}, {'w': '向', 'bg': 5.0, 'ed': 5.32}, {'w': '学', 'bg': 5.32, 'ed': 5.5}, {'w': '生', 'bg': 5.5, 'ed': 5.66}, {'w': '讲', 'bg': 5.66, 'ed': 5.86}, {'w': '解', 'bg': 5.86, 'ed': 6.18}, {'w': '主', 'bg': 6.18, 'ed': 6.46}, {'w': '要', 'bg': 6.46, 'ed': 6.62}, {'w': '传', 'bg': 6.62, 'ed': 6.8}, {'w': '感', 'bg': 6.8, 'ed': 7.0}, {'w': '器', 'bg': 7.0, 'ed': 7.16}, {'w': '的', 'bg': 7.16, 'ed': 7.28}, {'w': '工', 'bg': 7.28, 'ed': 7.44}, {'w': '作', 'bg': 7.44, 'ed': 7.6000000000000005}, {'w': '原', 'bg': 7.6000000000000005, 'ed': 7.74}, {'w': '理', 'bg': 7.74, 'ed': 8.06}, {'w': '让', 'bg': 8.06, 'ed': 8.44}, {'w': '学', 'bg': 8.44, 'ed': 8.64}, {'w': '生', 'bg': 8.64, 'ed': 8.84}, {'w': '对', 'bg': 8.84, 'ed': 9.06}, {'w': '设', 'bg': 9.06, 'ed': 9.24}, {'w': '备', 'bg': 9.24, 'ed': 9.52}, {'w': '有', 'bg': 9.52, 'ed': 9.86}, {'w': '大', 'bg': 9.86, 'ed': 10.1}, {'w': '致', 'bg': 10.1, 'ed': 10.24}, {'w': '的', 'bg': 10.24, 'ed': 10.36}, {'w': '认', 'bg': 10.36, 'ed': 10.5}, {'w': '知', 'bg': 10.5, 'ed': 11.040000000000001}, {'w': '随', 'bg': 11.040000000000001, 'ed': 11.56}, {'w': '后', 'bg': 11.56, 'ed': 11.82}, {'w': '使', 'bg': 11.82, 'ed': 12.1}, {'w': '用', 'bg': 12.1, 'ed': 12.26}, {'w': '真', 'bg': 12.26, 'ed': 12.44}, {'w': '实', 'bg': 12.44, 'ed': 12.620000000000001}, {'w': '传', 'bg': 12.620000000000001, 'ed': 12.780000000000001}, {'w': '感', 'bg': 12.780000000000001, 'ed': 12.94}, {'w': '器', 'bg': 12.94, 'ed': 13.1}, {'w': '的', 'bg': 13.1, 'ed': 13.26}, {'w': '内', 'bg': 13.26, 'ed': 13.42}, {'w': '部', 'bg': 13.42, 'ed': 13.56}, {'w': '构', 'bg': 13.56, 'ed': 13.700000000000001}, {'w': '造', 'bg': 13.700000000000001, 'ed': 13.86}, {'w': '图', 'bg': 13.86, 'ed': 14.280000000000001}, {'w': '辅', 'bg': 14.280000000000001, 'ed': 14.66}, {'w': '以', 'bg': 14.66, 'ed': 14.82}, {'w': '文', 'bg': 14.82, 'ed': 15.0}, {'w': '字', 'bg': 15.0, 'ed': 15.16}, {'w': '说', 'bg': 15.16, 'ed': 15.32}, {'w': '明', 'bg': 15.32, 'ed': 15.72}, {'w': '进', 'bg': 15.72, 'ed': 16.1}, {'w': '一', 'bg': 16.1, 'ed': 16.2}, {'w': '步', 'bg': 16.2, 'ed': 16.32}, {'w': '帮', 'bg': 16.32, 'ed': 16.48}, {'w': '助', 'bg': 16.48, 'ed': 16.66}, {'w': '学', 'bg': 16.66, 'ed': 16.82}, {'w': '生', 'bg': 16.82, 'ed': 17.12}, {'w': '对', 'bg': 17.12, 'ed': 17.48}, {'w': '传', 'bg': 17.48, 'ed': 17.66}, {'w': '感', 'bg': 17.66, 'ed': 17.84}, {'w': '器', 'bg': 17.84, 'ed': 18.12}, {'w': '有', 'bg': 18.12, 'ed': 18.42}, {'w': '更', 'bg': 18.42, 'ed': 18.66}, {'w': '深', 'bg': 18.66, 'ed': 18.88}, {'w': '刻', 
'bg': 18.88, 'ed': 19.04}, {'w': '的', 'bg': 19.04, 'ed': 19.16}, {'w': '印', 'bg': 19.16, 'ed': 19.3}, {'w': '象', 'bg': 19.3, 'ed': 19.8}, {'w': '最', 'bg': 19.8, 'ed': 20.3}, {'w': '后', 'bg': 20.3, 'ed': 20.62}, {'w': '结', 'bg': 20.62, 'ed': 20.96}, {'w': '合', 'bg': 20.96, 'ed': 21.14}, {'w': '具', 'bg': 21.14, 'ed': 21.3}, {'w': '体', 'bg': 21.3, 'ed': 21.42}, {'w': '的', 'bg': 21.42, 'ed': 21.580000000000002}, {'w': '实', 'bg': 21.580000000000002, 'ed': 21.76}, {'w': '践', 'bg': 21.76, 'ed': 21.92}, {'w': '应', 'bg': 21.92, 'ed': 22.080000000000002}, {'w': '用', 'bg': 22.080000000000002, 'ed': 22.44}, {'w': '提', 'bg': 22.44, 'ed': 22.78}, {'w': '升', 'bg': 22.78, 'ed': 22.94}, {'w': '学', 'bg': 22.94, 'ed': 23.12}, {'w': '生', 'bg': 23.12, 'ed': 23.34}, {'w': '对', 'bg': 23.34, 'ed': 23.62}, {'w': '实', 'bg': 23.62, 'ed': 23.82}, {'w': '训', 'bg': 23.82, 'ed': 23.96}, {'w': '的', 'bg': 23.96, 'ed': 24.12}, {'w': '兴', 'bg': 24.12, 'ed': 24.3}, {'w': '趣', 'bg': 24.3, 'ed': 24.6}, {'w': '以', 'bg': 24.6, 'ed': 24.88}, {'w': '及', 'bg': 24.88, 'ed': 25.12}, {'w': '意', 'bg': 25.12, 'ed': 25.34}, {'w': '义', 'bg': 25.34, 'ed': 25.46}, {'w': '感', 'bg': 25.46, 'ed': 26.04}]}
+[2023-03-30 23:27:01,060] [ INFO] - audio duration: 26.04, elapsed time: 46.581613540649414, RTF=1.7888484462614982
+sentences: ['第一部分是认知部分', '该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理', '让学生对设备有大致的认知', '随后使用真实传感器的内部构造图', '辅以文字说明', '进一步帮助学生对传感器有更深刻的印象', '最后结合具体的实践应用', '提升学生对实训的兴趣以及意义感']
+relative_times: [[0.0, 2.1], [2.1, 8.06], [8.06, 11.040000000000001], [11.040000000000001, 14.280000000000001], [14.280000000000001, 15.72], [15.72, 19.8], [19.8, 22.44], [22.44, 26.04]]
+[2023-03-30 23:27:01,076] [ INFO] - results saved to /home/fxb/PaddleSpeech-develop/data/认知.srt
+ ```
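+
+The generated `认知.srt` then starts with entries like the following (derived from the `sentences` and `relative_times` shown above; millisecond values may differ by ±1 ms because of floating-point truncation):
+ ```text
+ 1
+ 00:00:00,000 --> 00:00:02,100
+ 第一部分是认知部分
+
+ 2
+ 00:00:02,100 --> 00:00:08,060
+ 该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理
+ ```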
diff --git a/demos/streaming_asr_server/local/websocket_client_srt.py b/demos/streaming_asr_server/local/websocket_client_srt.py
new file mode 100644
index 00000000..02fea484
--- /dev/null
+++ b/demos/streaming_asr_server/local/websocket_client_srt.py
@@ -0,0 +1,162 @@
+#!/usr/bin/python
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# calc avg RTF(NOT Accurate): grep -rn RTF log.txt | awk '{print $NF}' | awk -F "=" '{sum += $NF} END {print "all time",sum, "audio num", NR, "RTF", sum/NR}'
+# python3 websocket_client_srt.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
+# python3 websocket_client_srt.py --server_ip 127.0.0.1 --port 8290 --wavfile ./zh.wav
+import argparse
+import asyncio
+import codecs
+import os
+import re
+
+from pydub import AudioSegment
+
+from paddlespeech.cli.log import logger
+from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler
+
+def convert_to_wav(input_file):
+ # Load audio file
+ audio = AudioSegment.from_file(input_file)
+
+ # Set parameters for audio file
+ audio = audio.set_channels(1)
+ audio = audio.set_frame_rate(16000)
+
+ # Create output filename
+ output_file = os.path.splitext(input_file)[0] + ".wav"
+
+ # Export audio file as WAV
+ audio.export(output_file, format="wav")
+
+ logger.info(f"{input_file} converted to {output_file}")
+
+def format_time(sec):
+ # Convert seconds to SRT format (HH:MM:SS,ms)
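+    # e.g. format_time(75.5) -> '00:01:15,500'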
+ hours = int(sec/3600)
+ minutes = int((sec%3600)/60)
+ seconds = int(sec%60)
+ milliseconds = int((sec%1)*1000)
+ return f'{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}'
+
+def results2srt(results, srt_file):
+ """convert results from paddlespeech to srt format for subtitle
+ Args:
+ results (dict): results from paddlespeech
+ """
+ # times contains start and end time of each word
+ times = results['times']
+ # result contains the whole sentence including punctuation
+ result = results['result']
+    # split result into several sentences by ',' and '。'
+ sentences = re.split(',|。', result)[:-1]
+ # print("sentences: ", sentences)
+ # generate relative time for each sentence in sentences
+ relative_times = []
+ word_i = 0
+ for sentence in sentences:
+ relative_times.append([])
+ for word in sentence:
+ if relative_times[-1] == []:
+ relative_times[-1].append(times[word_i]['bg'])
+ if len(relative_times[-1]) == 1:
+ relative_times[-1].append(times[word_i]['ed'])
+ else:
+ relative_times[-1][1] = times[word_i]['ed']
+ word_i += 1
+ # print("relative_times: ", relative_times)
+    # generate srt file according to relative_times and sentences
+ with open(srt_file, 'w') as f:
+ for i in range(len(sentences)):
+ # Write index number
+ f.write(str(i+1)+'\n')
+
+ # Write start and end times
+ start = format_time(relative_times[i][0])
+ end = format_time(relative_times[i][1])
+ f.write(start + ' --> ' + end + '\n')
+
+ # Write text
+ f.write(sentences[i]+'\n\n')
+ logger.info(f"results saved to {srt_file}")
+
+def main(args):
+ logger.info("asr websocket client start")
+ handler = ASRWsAudioHandler(
+ args.server_ip,
+ args.port,
+ endpoint=args.endpoint,
+ punc_server_ip=args.punc_server_ip,
+ punc_server_port=args.punc_server_port)
+ loop = asyncio.get_event_loop()
+
+ # check if the wav file is mp3 format
+ # if so, convert it to wav format using convert_to_wav function
+ if args.wavfile and os.path.exists(args.wavfile):
+ if args.wavfile.endswith(".mp3"):
+ convert_to_wav(args.wavfile)
+ args.wavfile = args.wavfile.replace(".mp3", ".wav")
+
+ # support to process single audio file
+ if args.wavfile and os.path.exists(args.wavfile):
+ logger.info(f"start to process the wavscp: {args.wavfile}")
+ result = loop.run_until_complete(handler.run(args.wavfile))
+ # result = result["result"]
+ # logger.info(f"asr websocket client finished : {result}")
+ results2srt(result, args.wavfile.replace(".wav", ".srt"))
+
+ # support to process batch audios from wav.scp
+ if args.wavscp and os.path.exists(args.wavscp):
+ logger.info(f"start to process the wavscp: {args.wavscp}")
+ with codecs.open(args.wavscp, 'r', encoding='utf-8') as f,\
+ codecs.open("result.txt", 'w', encoding='utf-8') as w:
+ for line in f:
+ utt_name, utt_path = line.strip().split()
+ result = loop.run_until_complete(handler.run(utt_path))
+ result = result["result"]
+ w.write(f"{utt_name} {result}\n")
+
+
+if __name__ == "__main__":
+ logger.info("Start to do streaming asr client")
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ '--server_ip', type=str, default='127.0.0.1', help='server ip')
+ parser.add_argument('--port', type=int, default=8090, help='server port')
+ parser.add_argument(
+ '--punc.server_ip',
+ type=str,
+ default=None,
+ dest="punc_server_ip",
+ help='Punctuation server ip')
+ parser.add_argument(
+ '--punc.port',
+ type=int,
+ default=8091,
+ dest="punc_server_port",
+ help='Punctuation server port')
+ parser.add_argument(
+ "--endpoint",
+ type=str,
+ default="/paddlespeech/asr/streaming",
+ help="ASR websocket endpoint")
+ parser.add_argument(
+ "--wavfile",
+ action="store",
+ help="wav file path ",
+ default="./16_audio.wav")
+ parser.add_argument(
+ "--wavscp", type=str, default=None, help="The batch audios dict text")
+ args = parser.parse_args()
+
+ main(args)
diff --git a/docs/images/note_map.png b/docs/images/note_map.png
new file mode 100644
index 00000000..f280d98c
Binary files /dev/null and b/docs/images/note_map.png differ
diff --git a/docs/requirements.txt b/docs/requirements.txt
index db6c8099..30622230 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -38,7 +38,7 @@ sphinx-markdown-tables
sphinx_rtd_theme
textgrid
timer
-ToJyutping
+ToJyutping==0.2.1
typeguard==2.13.3
webrtcvad
websockets
diff --git a/docs/source/install.md b/docs/source/install.md
index a4dae364..3607d718 100644
--- a/docs/source/install.md
+++ b/docs/source/install.md
@@ -95,7 +95,7 @@ bash
```
Then you can create a conda virtual environment using the following command:
```bash
-conda create -y -p tools/venv python=3.7
+conda create -y -p tools/venv python=3.8
```
Activate the conda virtual environment:
```bash
@@ -181,7 +181,7 @@ $HOME/miniconda3/bin/conda init
# use the "bash" command to make the conda environment works
bash
# create a conda virtual environment
-conda create -y -p tools/venv python=3.7
+conda create -y -p tools/venv python=3.8
# Activate the conda virtual environment:
conda activate tools/venv
# Install the conda packages
diff --git a/docs/source/install_cn.md b/docs/source/install_cn.md
index 7f05cdfe..01ae21fe 100644
--- a/docs/source/install_cn.md
+++ b/docs/source/install_cn.md
@@ -91,7 +91,7 @@ bash
```
然后你可以创建一个 conda 的虚拟环境:
```bash
-conda create -y -p tools/venv python=3.7
+conda create -y -p tools/venv python=3.8
```
激活 conda 虚拟环境:
```bash
@@ -173,7 +173,7 @@ $HOME/miniconda3/bin/conda init
# 激活 conda
bash
# 创建 Conda 虚拟环境
-conda create -y -p tools/venv python=3.7
+conda create -y -p tools/venv python=3.8
# 激活 Conda 虚拟环境:
conda activate tools/venv
# 安装 Conda 包
diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index 9e922177..87619a55 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -1,5 +1,7 @@
# Released Models
+> !!! Since PaddlePaddle supports 0-D tensors starting from version 2.5.0, existing PaddleSpeech static models will not work with it; please re-export your static models.
+
## Speech-to-Text Models
### Speech Recognition Model
@@ -10,7 +12,7 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER |
[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz)| Aishell Dataset | Char-based | 1.4 GB | 2 Conv + 5 bidirectional LSTM layers| 0.0554 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) | inference/python |-|
[Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 0.1879 (test\_meeting) |-| 10000 h |- | python |-|
[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz) | WenetSpeech Dataset | Char-based | 540 MB | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python |[FP32](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz) [INT8](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/static/asr1_chunk_conformer_u2pp_wenetspeech_static_quant_1.3.0.model.tar.gz) |
-[Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) | python |-|
+[Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.5.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.051968 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) | python |-|
[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.0.1.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0460 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) | python |-|
[Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) | python |-|
[Ds2 Offline Librispeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_offline_librispeech_ckpt_1.0.1.model.tar.gz)| Librispeech Dataset | Char-based | 1.3 GB | 2 Conv + 5 bidirectional LSTM layers| - |0.0467| 960 h | [Ds2 Offline Librispeech ASR0](../../examples/librispeech/asr0) | inference/python |-|
@@ -26,6 +28,8 @@ Model | Pre-Train Method | Pre-Train Data | Finetune Data | Size | Descriptions
[Wav2vec2ASR-large-960h-librispeech Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr3/wav2vec2ASR-large-960h-librispeech_ckpt_1.3.1.model.tar.gz) | wav2vec2 | Librispeech and LV-60k Dataset (5.3w h) | Librispeech (960 h) | 718 MB |Encoder: Wav2vec2.0, Decoder: CTC, Decoding method: Greedy search | - | 0.0189 | [Wav2vecASR Librispeech ASR3](../../examples/librispeech/asr3) |
[Wav2vec2-large-wenetspeech-self Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2-large-wenetspeech-self_ckpt_1.3.0.model.tar.gz) | wav2vec2 | Wenetspeech Dataset (1w h) | - | 714 MB |Pre-trained Wav2vec2.0 Model | - | - | - |
[Wav2vec2ASR-large-aishell1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz) | wav2vec2 | Wenetspeech Dataset (1w h) | aishell1 (train set) | 1.18 GB |Encoder: Wav2vec2.0, Decoder: CTC, Decoding method: Greedy search | 0.0510 | - | - |
+[Hubert-large-lv60 Model](https://paddlespeech.bj.bcebos.com/hubert/hubert-large-lv60.pdparams) | hubert | LV-60k Dataset | - | 1.18 GB |Pre-trained hubert Model | - | - | - |
+[Hubert-large-100h-librispeech Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr4/hubertASR-large-100h-librispeech_ckpt_1.4.0.model.tar.gz) | hubert | LV-60k Dataset | librispeech train-clean-100 | 1.27 GB |Encoder: Hubert, Decoder: Linear + CTC, Decoding method: Greedy search | - | 0.0587 | [HubertASR Librispeech ASR4](../../examples/librispeech/asr4) |
### Whisper Model
Demo Link | Training Data | Size | Descriptions | CER | Model
diff --git a/docs/source/tts/quick_start.md b/docs/source/tts/quick_start.md
index d8dbc646..d2a1b4ec 100644
--- a/docs/source/tts/quick_start.md
+++ b/docs/source/tts/quick_start.md
@@ -79,8 +79,8 @@ checkpoint_name
├── snapshot_iter_*.pdz
├── speech_stats.npy
├── phone_id_map.txt
-├── spk_id_map.txt (optimal)
-└── tone_id_map.txt (optimal)
+├── spk_id_map.txt (optional)
+└── tone_id_map.txt (optional)
```
**Vocoders:**
```text
diff --git a/docs/source/tts/quick_start_cn.md b/docs/source/tts/quick_start_cn.md
index c56d9bb4..ba259643 100644
--- a/docs/source/tts/quick_start_cn.md
+++ b/docs/source/tts/quick_start_cn.md
@@ -87,8 +87,8 @@ checkpoint_name
├── snapshot_iter_*.pdz
├── speech_stats.npy
├── phone_id_map.txt
-├── spk_id_map.txt (optimal)
-└── tone_id_map.txt (optimal)
+├── spk_id_map.txt (optional)
+└── tone_id_map.txt (optional)
```
**Vocoders:**
```text
diff --git a/docs/source/tts/svs_music_score.md b/docs/source/tts/svs_music_score.md
new file mode 100644
index 00000000..9f351c00
--- /dev/null
+++ b/docs/source/tts/svs_music_score.md
@@ -0,0 +1,183 @@
+I am not a music professional; corrections are welcome if anything in this document is wrong.
+
+# 1. Basic Concepts
+## 1.1 Numbered musical notation (jianpu) and note names
+
+
+
+
+In the figure above, the black-key note names from left to right are: C#/Db, D#/Eb, F#/Gb, G#/Ab, A#/Bb
+The 88 piano keys, shown below, are divided into the contra, great, small, one-line, two-line, three-line, and four-line octaves, whose note names carry the suffixes 1 2 3 4 5 6 7 respectively. For example, the one-line octave (the octave containing middle C) contains the keys C4, C#4/Db4, D4, D#4/Eb4, E4, F4, F#4/Gb4, G4, G#4/Ab4, A4, A#4/Bb4, B4
+A piano octave is the eight notes 1 2 3 4 5 6 7 1, where the last note is the higher 1 (one octave up). **Following the pattern: whole, whole, half, whole, whole, whole, half** yields the notes 1 2 3 4 5 6 7 (high) 1
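+
+As a quick illustration of this whole/half-step pattern, a minimal Python sketch (not part of PaddleSpeech) that builds the scale of any major key from its root note:
+
+```python
+# Whole/half-step pattern of a major scale: W W H W W W H
+NOTE_NAMES = ['C', 'C#/Db', 'D', 'D#/Eb', 'E', 'F', 'F#/Gb', 'G', 'G#/Ab', 'A', 'A#/Bb', 'B']
+STEPS = [2, 2, 1, 2, 2, 2, 1]   # whole step = 2 semitones, half step = 1 semitone
+
+def major_scale(root='C'):
+    idx = NOTE_NAMES.index(root)
+    scale = [NOTE_NAMES[idx]]
+    for step in STEPS:
+        idx = (idx + step) % 12
+        scale.append(NOTE_NAMES[idx])
+    return scale
+
+print(major_scale('C'))   # ['C', 'D', 'E', 'F', 'G', 'A', 'B', 'C']
+print(major_scale('D'))   # ['D', 'E', 'F#/Gb', 'G', 'A', 'B', 'C#/Db', 'D']
+```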
+
+
+
+
+
+## 1.2 The twelve major keys
+"#" indicates a sharp (raised by a semitone)
+
+
+
+
+
+"b" indicates a flat (lowered by a semitone)
+
+
+
+
+
+The name of a major key tells you which piano key the note Do (1 in numbered notation) starts on; for example, in D major the key D plays the note Do.
+The figure below is the mapping between numbered notation and note names for the twelve major keys.
+
+
+
+
+
+
+## 1.3 Tempo
+Tempo describes the speed of the beat/pulse, i.e. how many beats there are per minute (beats per minute, BPM).
+
+
+
+
+
+whole note --> 4 beats
+half note --> 2 beats
+quarter note --> 1 beat
+eighth note --> 1/2 beat
+sixteenth note --> 1/4 beat
+
+
+# 2. Application Experiments
+## 2.1 Extracting music scores from sheet music
+A music score contains: note, note_dur, is_slur
+
+
+
+
+
+From the key signature *bE* in the upper-left corner of the sheet, the piece is in **E-flat major**, so the numbered notation can be mapped to notes with the table of the twelve major keys in Section 1.2.
+From the *quarter note* tempo marking in the upper-left corner, the speed is **95 beats per minute**, so one beat lasts **60/95 = 0.631578 s**.
+From the *4/4* time signature, a quarter note counts as one beat (the denominator 4) and each measure has 4 beats (the numerator 4).
+
+The music score obtained from this sheet is as follows:
+
+|text |phone |numbered notation (a trailing dot = one octave higher) |note (octaves counted from the small octave) |beats (半 = half beat, 一 = one beat) |note_dur |is_slur|
+|:-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: |
+|小 |x |5 |A#3/Bb3 |半 |0.315789 |0 |
+| |iao |5 |A#3/Bb3 |半 |0.315789 |0 |
+|酒 |j |1. |D#4/Eb4 |半 |0.315789 |0 |
+| |iu |1. |D#4/Eb4 |半 |0.315789 |0 |
+|窝 |w |2. |F4 |半 |0.315789 |0 |
+| |o |2. |F4 |半 |0.315789 |0 |
+|长 |ch |3. |G4 |半 |0.315789 |0 |
+| |ang |3. |G4 |半 |0.315789 |0 |
+| |ang |1. |D#4/Eb4 |半 |0.315789 |1 |
+|睫 |j |1. |D#4/Eb4 |半 |0.315789 |0 |
+| |ie |1. |D#4/Eb4 |半 |0.315789 |0 |
+| |ie |5 |A#3/Bb3 |半 |0.315789 |1 |
+|毛 |m |5 |A#3/Bb3 |一 |0.631578 |0 |
+| |ao |5 |A#3/Bb3 |一 |0.631578 |0 |
+|是 |sh |5 |A#3/Bb3 |半 |0.315789 |0 |
+| |i |5 |A#3/Bb3 |半 |0.315789 |0 |
+|你 |n |3. |G4 |半 |0.315789 |0 |
+| |i |3. |G4 |半 |0.315789 |0 |
+|最 |z |2. |F4 |半 |0.315789 |0 |
+| |ui |2. |F4 |半 |0.315789 |0 |
+|美 |m |3. |G4 |半 |0.315789 |0 |
+| |ei |3. |G4 |半 |0.315789 |0 |
+|的 |d |2. |F4 |半 |0.315789 |0 |
+| |e |2. |F4 |半 |0.315789 |0 |
+|记 |j |7 |D4 |半 |0.315789 |0 |
+| |i |7 |D4 |半 |0.315789 |0 |
+|号 |h |5 |A#3/Bb3 |半 |0.315789 |0 |
+| |ao |5 |A#3/Bb3 |半 |0.315789 |0 |
+
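+As a quick sanity check on the `note` and `note_dur` columns above, a minimal Python sketch (the degree-to-note mapping and tempo are restated from this section, not taken from any PaddleSpeech API):
+
+```python
+# One beat at the marked tempo: quarter note = 95 BPM
+beat = 60 / 95                     # ~0.631579 s per beat
+
+# Jianpu degree (a trailing dot means one octave up) -> note name in Eb major,
+# for the degrees that occur in the table above.
+DEGREE_TO_NOTE = {
+    '5': 'A#3/Bb3', '7': 'D4',
+    '1.': 'D#4/Eb4', '2.': 'F4', '3.': 'G4',
+}
+
+def note_dur(beats):
+    # duration in seconds of a note lasting `beats` beats (半 = 0.5, 一 = 1)
+    return round(beat * beats, 6)
+
+print(DEGREE_TO_NOTE['5'], note_dur(0.5))   # A#3/Bb3 0.315789
+print(DEGREE_TO_NOTE['1.'], note_dur(1.0))  # D#4/Eb4 0.631579 (the table truncates to 0.631578)
+```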
+
+## 2.2 Some experiments
+
+
+
+
+The experiments above show that extracting a music score in this way is feasible. In practice, you can **flexibly insert "AP" (breath) and "SP" (pause) marks into the lyrics**, with a corresponding **rest added to the notes**, which makes the synthesized singing sound more natural.
+Beyond that, choose the major key and the starting octave so that **the resulting notes appear in the training data set**; if a note unseen in the training data is passed at inference time, the synthesized audio may not have the expected pitch.
+
+
+# 3. Miscellaneous
+## 3.1 Reading MIDI files
+
+```python
+import mido
+mid = mido.MidiFile('2093.midi')
+```
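+
+Going one step further, a minimal sketch (the file name `2093.midi` is just the placeholder used above) that walks through the MIDI messages, accumulates absolute time, and prints each note event with its note name so it can be aligned with the lyrics:
+
+```python
+import mido
+
+NOTE_NAMES = ['C', 'C#/Db', 'D', 'D#/Eb', 'E', 'F', 'F#/Gb', 'G', 'G#/Ab', 'A', 'A#/Bb', 'B']
+
+def midi_to_name(midi_note):
+    # MIDI note 60 corresponds to C4
+    return f"{NOTE_NAMES[midi_note % 12]}{midi_note // 12 - 1}"
+
+mid = mido.MidiFile('2093.midi')
+now = 0.0
+for msg in mid:                     # iterating a MidiFile yields times in seconds
+    now += msg.time
+    if msg.type in ('note_on', 'note_off'):
+        # a note_on with velocity 0 is equivalent to a note_off
+        print(f"{now:.3f}s {msg.type:8s} {midi_to_name(msg.note)}")
+```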
diff --git a/docs/topic/package_release/python_package_release.md b/docs/topic/package_release/python_package_release.md
index cb1029e7..c735e0bd 100644
--- a/docs/topic/package_release/python_package_release.md
+++ b/docs/topic/package_release/python_package_release.md
@@ -165,8 +165,7 @@ docker run -it xxxxxx
设置python:
```bash
-export PATH="/opt/python/cp37-cp37m/bin/:$PATH"
-#export PATH="/opt/python/cp38-cp38/bin/:$PATH"
+export PATH="/opt/python/cp38-cp38/bin/:$PATH"
#export PATH="/opt/python/cp39-cp39/bin/:$PATH"
```
diff --git a/docs/tutorial/asr/tutorial_transformer.ipynb b/docs/tutorial/asr/tutorial_transformer.ipynb
index dc303006..77aed4bf 100644
--- a/docs/tutorial/asr/tutorial_transformer.ipynb
+++ b/docs/tutorial/asr/tutorial_transformer.ipynb
@@ -236,8 +236,8 @@
"warnings.filterwarnings('ignore')\n",
"\n",
"from yacs.config import CfgNode\n",
- "from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogramKaldi\n",
- "from paddlespeech.s2t.transform.cmvn import GlobalCMVN\n",
+ "from paddlespeech.audio.transform.spectrogram import LogMelSpectrogramKaldi\n",
+ "from paddlespeech.audio.transform.cmvn import GlobalCMVN\n",
"from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer\n",
"from paddlespeech.s2t.models.u2 import U2Model\n",
"\n",
diff --git a/docs/tutorial/st/st_tutorial.ipynb b/docs/tutorial/st/st_tutorial.ipynb
index 2fb85053..e755beba 100644
--- a/docs/tutorial/st/st_tutorial.ipynb
+++ b/docs/tutorial/st/st_tutorial.ipynb
@@ -62,7 +62,7 @@
"collapsed": false
},
"source": [
- "# 使用Transformer进行端到端语音翻译的的基本流程\n",
+ "# 使用Transformer进行端到端语音翻译的基本流程\n",
"## 基础模型\n",
"由于 ASR 章节已经介绍了 Transformer 以及语音特征抽取,在此便不做过多介绍,感兴趣的同学可以去相关章节进行了解。\n",
"\n",
diff --git a/docs/tutorial/tts/tts_tutorial.ipynb b/docs/tutorial/tts/tts_tutorial.ipynb
index 583adb01..0cecb680 100644
--- a/docs/tutorial/tts/tts_tutorial.ipynb
+++ b/docs/tutorial/tts/tts_tutorial.ipynb
@@ -464,7 +464,7 @@
" FastSpeech2 网络结构图 \n",
"\n",
"\n",
- "PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似),这样的合成结果可以更加**稳定**。\n",
+ "PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似),这样的合成结果可以更加**稳定**。\n",
" \n",
" FastPitch 网络结构图 \n",
"\n",
diff --git a/examples/aishell/asr0/local/train.sh b/examples/aishell/asr0/local/train.sh
index 2b71b7f7..c0da3325 100755
--- a/examples/aishell/asr0/local/train.sh
+++ b/examples/aishell/asr0/local/train.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-if [ $# -lt 2 ] && [ $# -gt 3 ];then
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi
diff --git a/examples/aishell/asr1/RESULTS.md b/examples/aishell/asr1/RESULTS.md
index 79c695b1..be771ba5 100644
--- a/examples/aishell/asr1/RESULTS.md
+++ b/examples/aishell/asr1/RESULTS.md
@@ -1,27 +1,55 @@
# Aishell
-## Conformer
-paddle version: 2.2.2
-paddlespeech version: 1.0.1
-| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER |
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 |
-| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 |
-| conformer | 47.07M | conf/conformer.yaml | spec_aug| test | ctc_prefix_beam_search | - | 0.0480 |
-| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 |
+## RoFormer Streaming
+paddle version: 2.5.0
+paddlespeech version: 1.5.0
+
+Tesla V100-SXM2-32GB: 1 node, 4 cards
+Global BatchSize: 32 * 4
+Training Done: 1 day, 12:56:39.639646
+### `decoding.decoding_chunk_size=16`
+
+> chunk_size=16, ((16 - 1) * 4 + 7) * 10ms = (16 * 4 + 3) * 10ms = 670ms
+
+| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | CER |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention | 16, -1 | - | 5.63 |
+| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_greedy_search | 16, -1 | - | 6.13 |
+| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | 16, -1 | - | 6.13 |
+| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | 5.44 |
+
+### `decoding.decoding_chunk_size=-1`
+
+| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | CER |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention | -1, -1 | - | 5.39 |
+| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_greedy_search | -1, -1 | - | 5.51 |
+| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | ctc_prefix_beam_search | -1, -1 | - | 5.51 |
+| roformer | 44.80M | conf/chunk_roformer.yaml | spec_aug | test | attention_rescoring | -1, -1 | - | 4.99 |
## Conformer Streaming
paddle version: 2.2.2
-paddlespeech version: 0.2.0
+paddlespeech version: 1.4.1
Need to set `decoding.decoding_chunk_size=16` when decoding.
| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | CER |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | attention | 16, -1 | - | 0.0551 |
-| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | ctc_greedy_search | 16, -1 | - | 0.0629 |
-| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | ctc_prefix_beam_search | 16, -1 | - | 0.0629 |
-| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | 0.0544 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | attention | 16, -1 | - | 0.056102 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | ctc_greedy_search | 16, -1 | - | 0.058160 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | ctc_prefix_beam_search | 16, -1 | - | 0.058160 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | 0.051968 |
+
+
+## Conformer
+paddle version: 2.2.2
+paddlespeech version: 1.0.1
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0522 |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.0481 |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_prefix_beam_search | - | 0.0480 |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.0460 |
## Transformer
diff --git a/examples/aishell/asr1/conf/chunk_roformer.yaml b/examples/aishell/asr1/conf/chunk_roformer.yaml
new file mode 100644
index 00000000..a4051a02
--- /dev/null
+++ b/examples/aishell/asr1/conf/chunk_roformer.yaml
@@ -0,0 +1,98 @@
+############################################
+# Network Architecture #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+ output_size: 256 # dimension of attention
+ attention_heads: 4
+ linear_units: 2048 # the number of units of position-wise feed forward
+ num_blocks: 12 # the number of encoder blocks
+ dropout_rate: 0.1 # sublayer output dropout
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+ normalize_before: True
+ cnn_module_kernel: 15
+ use_cnn_module: True
+ activation_type: 'swish'
+ pos_enc_layer_type: 'rope_pos' # abs_pos, rel_pos, rope_pos
+ selfattention_layer_type: 'rel_selfattn' # unused
+ causal: true
+ use_dynamic_chunk: true
+ cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+ use_dynamic_left_chunk: false
+# decoder related
+decoder: transformer # transformer, bitransformer
+decoder_conf:
+ attention_heads: 4
+ linear_units: 2048
+ num_blocks: 6
+ r_num_blocks: 0 # only for bitransformer
+ dropout_rate: 0.1 # sublayer output dropout
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.0
+ src_attention_dropout_rate: 0.0
+# hybrid CTC/attention
+model_conf:
+ ctc_weight: 0.3
+ lsm_weight: 0.1 # label smoothing option
+ reverse_weight: 0.0 # only for bitransformer
+ length_normalized_loss: false
+    init_type: 'kaiming_uniform' # !Warning: needed for convergence
+
+###########################################
+# Data #
+###########################################
+
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
+
+
+###########################################
+# Dataloader #
+###########################################
+
+vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 32
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 2
+subsampling_factor: 1
+num_encs: 1
+
+###########################################
+# Training #
+###########################################
+n_epoch: 240
+accum_grad: 1
+global_grad_clip: 5.0
+dist_sampler: True
+optim: adam
+optim_conf:
+ lr: 0.001
+ weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 25000
+ lr_decay: 1.0
+log_interval: 100
+checkpoint:
+ kbest_n: 50
+ latest_n: 5
diff --git a/examples/aishell/asr1/conf/chunk_roformer_bidecoder.yaml b/examples/aishell/asr1/conf/chunk_roformer_bidecoder.yaml
new file mode 100644
index 00000000..aa3a0aca
--- /dev/null
+++ b/examples/aishell/asr1/conf/chunk_roformer_bidecoder.yaml
@@ -0,0 +1,98 @@
+############################################
+# Network Architecture #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: conformer
+encoder_conf:
+ output_size: 256 # dimension of attention
+ attention_heads: 4
+ linear_units: 2048 # the number of units of position-wise feed forward
+ num_blocks: 12 # the number of encoder blocks
+ dropout_rate: 0.1 # sublayer output dropout
+ positional_dropout_rate: 0.1
+ attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+ normalize_before: True
+ cnn_module_kernel: 15
+ use_cnn_module: True
+ activation_type: 'swish'
+ pos_enc_layer_type: 'rope_pos' # abs_pos, rel_pos, rope_pos
+ selfattention_layer_type: 'rel_selfattn' # unused
+ causal: true
+ use_dynamic_chunk: true
+ cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
+ use_dynamic_left_chunk: false
+# decoder related
+decoder: bitransformer # transformer, bitransformer
+decoder_conf:
+ attention_heads: 4
+ linear_units: 2048
+ num_blocks: 3
+ r_num_blocks: 3 # only for bitransformer
+ dropout_rate: 0.1 # sublayer output dropout
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.0
+ src_attention_dropout_rate: 0.0
+# hybrid CTC/attention
+model_conf:
+ ctc_weight: 0.3
+ lsm_weight: 0.1 # label smoothing option
+ reverse_weight: 0.3 # only for bitransformer
+ length_normalized_loss: false
+    init_type: 'kaiming_uniform' # !Warning: needed for convergence
+
+###########################################
+# Data #
+###########################################
+
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
+
+
+###########################################
+# Dataloader #
+###########################################
+
+vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 32
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 2
+subsampling_factor: 1
+num_encs: 1
+
+###########################################
+# Training #
+###########################################
+n_epoch: 240
+accum_grad: 1
+global_grad_clip: 5.0
+dist_sampler: True
+optim: adam
+optim_conf:
+ lr: 0.001
+ weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 25000
+ lr_decay: 1.0
+log_interval: 100
+checkpoint:
+ kbest_n: 50
+ latest_n: 5
diff --git a/examples/aishell/asr1/conf/chunk_squeezeformer.yaml b/examples/aishell/asr1/conf/chunk_squeezeformer.yaml
new file mode 100644
index 00000000..35a90b7d
--- /dev/null
+++ b/examples/aishell/asr1/conf/chunk_squeezeformer.yaml
@@ -0,0 +1,98 @@
+############################################
+# Network Architecture #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: squeezeformer
+encoder_conf:
+ encoder_dim: 256 # dimension of attention
+ output_size: 256 # dimension of output
+ attention_heads: 4
+ num_blocks: 12 # the number of encoder blocks
+ reduce_idx: 5
+ recover_idx: 11
+ feed_forward_expansion_factor: 8
+ input_dropout_rate: 0.1
+ feed_forward_dropout_rate: 0.1
+ attention_dropout_rate: 0.1
+ adaptive_scale: true
+ cnn_module_kernel: 31
+ normalize_before: false
+ activation_type: 'swish'
+ pos_enc_layer_type: 'rel_pos'
+ time_reduction_layer_type: 'stream'
+ causal: true
+ use_dynamic_chunk: true
+ use_dynamic_left_chunk: false
+
+# decoder related
+decoder: transformer
+decoder_conf:
+ attention_heads: 4
+ linear_units: 2048
+ num_blocks: 6
+ dropout_rate: 0.1 # sublayer output dropout
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.0
+ src_attention_dropout_rate: 0.0
+# hybrid CTC/attention
+model_conf:
+ ctc_weight: 0.3
+ lsm_weight: 0.1 # label smoothing option
+ length_normalized_loss: false
+    init_type: 'kaiming_uniform' # !Warning: needed for convergence
+
+###########################################
+# Data #
+###########################################
+
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
+
+
+###########################################
+# Dataloader #
+###########################################
+
+vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 32
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 2
+subsampling_factor: 1
+num_encs: 1
+
+###########################################
+# Training #
+###########################################
+n_epoch: 240
+accum_grad: 1
+global_grad_clip: 5.0
+dist_sampler: True
+optim: adam
+optim_conf:
+ lr: 0.001
+ weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 25000
+ lr_decay: 1.0
+log_interval: 100
+checkpoint:
+ kbest_n: 50
+ latest_n: 5
diff --git a/examples/aishell/asr1/conf/squeezeformer.yaml b/examples/aishell/asr1/conf/squeezeformer.yaml
new file mode 100644
index 00000000..b7841aca
--- /dev/null
+++ b/examples/aishell/asr1/conf/squeezeformer.yaml
@@ -0,0 +1,93 @@
+############################################
+# Network Architecture #
+############################################
+cmvn_file:
+cmvn_file_type: "json"
+# encoder related
+encoder: squeezeformer
+encoder_conf:
+ encoder_dim: 256 # dimension of attention
+ output_size: 256 # dimension of output
+ attention_heads: 4
+ num_blocks: 12 # the number of encoder blocks
+ reduce_idx: 5
+ recover_idx: 11
+ feed_forward_expansion_factor: 8
+ input_dropout_rate: 0.1
+ feed_forward_dropout_rate: 0.1
+ attention_dropout_rate: 0.1
+ adaptive_scale: true
+ cnn_module_kernel: 31
+ normalize_before: false
+ activation_type: 'swish'
+ pos_enc_layer_type: 'rel_pos'
+ time_reduction_layer_type: 'conv1d'
+
+# decoder related
+decoder: transformer
+decoder_conf:
+ attention_heads: 4
+ linear_units: 2048
+ num_blocks: 6
+ dropout_rate: 0.1
+ positional_dropout_rate: 0.1
+ self_attention_dropout_rate: 0.0
+ src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+ ctc_weight: 0.3
+ lsm_weight: 0.1 # label smoothing option
+ length_normalized_loss: false
+ init_type: 'kaiming_uniform' # !Warning: needed for convergence
+
+###########################################
+# Data #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
+
+###########################################
+# Dataloader #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+spm_model_prefix: ''
+unit_type: 'char'
+preprocess_config: conf/preprocess.yaml
+feat_dim: 80
+stride_ms: 10.0
+window_ms: 25.0
+sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batch_size: 32
+maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 2
+subsampling_factor: 1
+num_encs: 1
+
+###########################################
+# Training #
+###########################################
+n_epoch: 150
+accum_grad: 8
+global_grad_clip: 5.0
+dist_sampler: False
+optim: adam
+optim_conf:
+ lr: 0.002
+ weight_decay: 1.0e-6
+scheduler: warmuplr
+scheduler_conf:
+ warmup_steps: 25000
+ lr_decay: 1.0
+log_interval: 100
+checkpoint:
+ kbest_n: 50
+ latest_n: 5
diff --git a/examples/aishell/asr1/local/test.sh b/examples/aishell/asr1/local/test.sh
index 26926b4a..8487e990 100755
--- a/examples/aishell/asr1/local/test.sh
+++ b/examples/aishell/asr1/local/test.sh
@@ -1,15 +1,21 @@
#!/bin/bash
-if [ $# != 3 ];then
- echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
- exit -1
-fi
+set -e
stage=0
stop_stage=100
+
+source utils/parse_options.sh || exit 1;
+
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
+
+if [ $# != 3 ];then
+ echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
+ exit -1
+fi
+
config_path=$1
decode_config_path=$2
ckpt_prefix=$3
@@ -92,6 +98,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
fi
if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
+ echo "using sclite to compute cer..."
# format the reference test file for sclite
python utils/format_rsl.py \
--origin_ref data/manifest.test.raw \
diff --git a/examples/aishell/asr1/local/train.sh b/examples/aishell/asr1/local/train.sh
index bfa8dd97..3d4f052a 100755
--- a/examples/aishell/asr1/local/train.sh
+++ b/examples/aishell/asr1/local/train.sh
@@ -17,7 +17,7 @@ if [ ${seed} != 0 ]; then
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
fi
-if [ $# -lt 2 ] && [ $# -gt 3 ];then
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi
diff --git a/examples/aishell/asr3/local/train.sh b/examples/aishell/asr3/local/train.sh
index e51e3d34..33fef0fd 100755
--- a/examples/aishell/asr3/local/train.sh
+++ b/examples/aishell/asr3/local/train.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-if [ $# -lt 2 ] && [ $# -gt 3 ];then
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi
diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md
index 49801c4c..c33d665c 100644
--- a/examples/aishell3/tts3/README.md
+++ b/examples/aishell3/tts3/README.md
@@ -241,7 +241,7 @@ fastspeech2_aishell3_ckpt_1.1.0
├── speaker_id_map.txt # speaker id map file when training a multi-speaker fastspeech2
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
@@ -257,7 +257,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=fastspeech2_aishell3_ckpt_1.1.0/phone_id_map.txt \
--speaker_dict=fastspeech2_aishell3_ckpt_1.1.0/speaker_id_map.txt \
diff --git a/examples/aishell3/tts3/local/inference.sh b/examples/aishell3/tts3/local/inference.sh
index dc05ec59..2d096bdc 100755
--- a/examples/aishell3/tts3/local/inference.sh
+++ b/examples/aishell3/tts3/local/inference.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_aishell3 \
--voc=pwgan_aishell3 \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_aishell3 \
--voc=hifigan_aishell3 \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/aishell3/tts3/local/lite_predict.sh b/examples/aishell3/tts3/local/lite_predict.sh
index e77e8b6c..2534b460 100755
--- a/examples/aishell3/tts3/local/lite_predict.sh
+++ b/examples/aishell3/tts3/local/lite_predict.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_aishell3 \
--voc=pwgan_aishell3 \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_aishell3 \
--voc=hifigan_aishell3 \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/aishell3/tts3/local/ort_predict.sh b/examples/aishell3/tts3/local/ort_predict.sh
index 24e66f68..9c41dee3 100755
--- a/examples/aishell3/tts3/local/ort_predict.sh
+++ b/examples/aishell3/tts3/local/ort_predict.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_aishell3 \
--voc=pwgan_aishell3 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_aishell3 \
--voc=hifigan_aishell3 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
diff --git a/examples/aishell3/tts3/local/synthesize_e2e.sh b/examples/aishell3/tts3/local/synthesize_e2e.sh
index 158350ae..2cc22ede 100755
--- a/examples/aishell3/tts3/local/synthesize_e2e.sh
+++ b/examples/aishell3/tts3/local/synthesize_e2e.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/aishell3/tts3/run.sh b/examples/aishell3/tts3/run.sh
index b5da076b..8dcecaa0 100755
--- a/examples/aishell3/tts3/run.sh
+++ b/examples/aishell3/tts3/run.sh
@@ -43,10 +43,7 @@ fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
- version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
- if [[ -z "$version" || ${version} != '1.0.0' ]]; then
- pip install paddle2onnx==1.0.0
- fi
+ pip install paddle2onnx --upgrade
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_aishell3
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_aishell3
diff --git a/examples/aishell3/vits/README.md b/examples/aishell3/vits/README.md
index dc80e18b..8c19e29f 100644
--- a/examples/aishell3/vits/README.md
+++ b/examples/aishell3/vits/README.md
@@ -196,7 +196,7 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--phones_dict=vits_aishell3_ckpt_1.1.0/phone_id_map.txt \
--speaker_dict=vits_aishell3_ckpt_1.1.0/speaker_id_map.txt \
--output_dir=exp/default/test_e2e \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--add-blank=${add_blank}
```
-->
diff --git a/examples/aishell3/vits/local/synthesize_e2e.sh b/examples/aishell3/vits/local/synthesize_e2e.sh
index 1bd58549..5369cbf9 100755
--- a/examples/aishell3/vits/local/synthesize_e2e.sh
+++ b/examples/aishell3/vits/local/synthesize_e2e.sh
@@ -20,6 +20,6 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--speaker_dict=dump/speaker_id_map.txt \
--spk_id=0 \
--output_dir=${train_output_path}/test_e2e \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--add-blank=${add_blank}
fi
diff --git a/examples/canton/tts3/README.md b/examples/canton/tts3/README.md
index f46949d2..87ef4090 100644
--- a/examples/canton/tts3/README.md
+++ b/examples/canton/tts3/README.md
@@ -102,7 +102,7 @@ Download the pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](
unzip pwg_aishell3_ckpt_0.5.zip
```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_canton.txt` using pretrained fastspeech2 and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences_canton.txt` using pretrained fastspeech2 and parallel wavegan models.
```bash
source path.sh
@@ -118,7 +118,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=canton \
- --text=${BIN_DIR}/../sentences_canton.txt \
+ --text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=fastspeech2_canton_ckpt_1.4.0/phone_id_map.txt \
--speaker_dict=fastspeech2_canton_ckpt_1.4.0/speaker_id_map.txt \
diff --git a/examples/canton/tts3/local/inference.sh b/examples/canton/tts3/local/inference.sh
index caf0b438..ad3af2d0 100755
--- a/examples/canton/tts3/local/inference.sh
+++ b/examples/canton/tts3/local/inference.sh
@@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_canton \
--voc=pwgan_aishell3 \
--spk_id=10 \
- --text=${BIN_DIR}/../sentences_canton.txt \
+ --text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -27,7 +27,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_canton \
--voc=mb_melgan_csmsc \
--spk_id=10 \
- --text=${BIN_DIR}/../sentences_canton.txt \
+ --text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -41,7 +41,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_canton \
--voc=hifigan_csmsc \
--spk_id=10 \
- --text=${BIN_DIR}/../sentences_canton.txt \
+ --text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -55,7 +55,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--am=fastspeech2_canton \
--voc=wavernn_csmsc \
--spk_id=10 \
- --text=${BIN_DIR}/../sentences_canton.txt \
+ --text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/canton/tts3/local/ort_predict.sh b/examples/canton/tts3/local/ort_predict.sh
index d95e49f9..edbe0406 100755
--- a/examples/canton/tts3/local/ort_predict.sh
+++ b/examples/canton/tts3/local/ort_predict.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc=pwgan_aishell3 \
--spk_id=10 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../sentences_canton.txt \
+ --text=${BIN_DIR}/../../assets/sentences_canton.txt \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
--lang=canton \
@@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc=mb_melgan_csmsc \
--spk_id=10 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../sentences_canton.txt \
+ --text=${BIN_DIR}/../../assets/sentences_canton.txt \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
--lang=canton \
@@ -40,7 +40,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_canton \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../sentences_canton.txt \
+ --text=${BIN_DIR}/../../assets/sentences_canton.txt \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
--lang=canton \
diff --git a/examples/canton/tts3/local/synthesize_e2e.sh b/examples/canton/tts3/local/synthesize_e2e.sh
index 8cf7eb22..38b7e1af 100755
--- a/examples/canton/tts3/local/synthesize_e2e.sh
+++ b/examples/canton/tts3/local/synthesize_e2e.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=canton \
- --text=${BIN_DIR}/../sentences_canton.txt \
+ --text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=canton \
- --text=${BIN_DIR}/../sentences_canton.txt \
+ --text=${BIN_DIR}/../../assets/sentences_canton.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/canton/tts3/run.sh b/examples/canton/tts3/run.sh
index 3a3dfe0a..acfc5022 100755
--- a/examples/canton/tts3/run.sh
+++ b/examples/canton/tts3/run.sh
@@ -46,10 +46,7 @@ fi
# we have only tested the following models so far
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
- version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
- if [[ -z "$version" || ${version} != '1.0.0' ]]; then
- pip install paddle2onnx==1.0.0
- fi
+ pip install paddle2onnx --upgrade
../../csmsc/tts3/local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_canton
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc
diff --git a/examples/csmsc/jets/README.md b/examples/csmsc/jets/README.md
new file mode 100644
index 00000000..07dade0e
--- /dev/null
+++ b/examples/csmsc/jets/README.md
@@ -0,0 +1,108 @@
+# JETS with CSMSC
+This example contains code used to train a [JETS](https://arxiv.org/abs/2203.16852v1) model with the [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
+
+## Dataset
+### Download and Extract
+Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source).
+
+### Get MFA Result and Extract
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes and durations for JETS.
+You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
+
+## Get Started
+Assume the path to the dataset is `~/datasets/BZNSYP`.
+Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+ - synthesize waveform from `metadata.jsonl`.
+ - synthesize waveform from a text file.
+
+```bash
+./run.sh
+```
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
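+`run.sh` defines stages 0 to 4 (preprocess, train, synthesize from `metadata.jsonl`, synthesize_e2e, inference). Once the `dump` folder exists you can therefore skip preprocessing and run only the later stages; a minimal sketch:
+```bash
+# preprocessing already done: run training, synthesis from metadata, and end-to-end synthesis
+./run.sh --stage 1 --stop-stage 3
+```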
+### Data Preprocessing
+```bash
+./local/preprocess.sh ${conf_path}
+```
+When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
+
+```text
+dump
+├── dev
+│ ├── norm
+│ └── raw
+├── phone_id_map.txt
+├── speaker_id_map.txt
+├── test
+│ ├── norm
+│ └── raw
+└── train
+ ├── feats_stats.npy
+ ├── norm
+ └── raw
+```
+The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains the wave, mel spectrogram, speech, pitch, and energy features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/feats_stats.npy`.
+
+Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, the path of feats, feats_lengths, the path of pitch features, the path of energy features, the path of raw waves, speaker, and the id of each utterance.
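+If you want a quick look at what the dataloader will read, you can pretty-print a single entry. This is only an inspection sketch; the exact field names in your dump may differ slightly:
+```bash
+# each line of metadata.jsonl is one JSON object describing an utterance
+head -n 1 dump/train/norm/metadata.jsonl | python3 -m json.tool
+```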
+
+### Model Training
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
+```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+Here's the complete help message.
+```text
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+ [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+ [--ngpu NGPU] [--phones-dict PHONES_DICT]
+
+Train a JETS model.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config CONFIG config file to overwrite default config.
+ --train-metadata TRAIN_METADATA
+ training data.
+ --dev-metadata DEV_METADATA
+ dev data.
+ --output-dir OUTPUT_DIR
+ output dir.
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --phones-dict PHONES_DICT
+ phone vocabulary file.
+```
+1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
+2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
+3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
+4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+5. `--phones-dict` is the path of the phone vocabulary file.
+
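+Putting the flags above together (after `source path.sh`), a direct invocation of `train.py` looks like the sketch below, which mirrors what `./local/train.sh` does with the default config:
+```bash
+python3 ${BIN_DIR}/train.py \
+    --config=conf/default.yaml \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --output-dir=exp/default \
+    --ngpu=1 \
+    --phones-dict=dump/phone_id_map.txt
+```
+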
+### Synthesizing
+
+`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveforms from `metadata.jsonl`.
+
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveforms from a text file.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+
+## Pretrained Model
+
+The pretrained model can be downloaded here:
+
+- [jets_csmsc_ckpt_1.5.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/jets_csmsc_ckpt_1.5.0.zip)
+
+The static model can be downloaded here:
+
+- [jets_csmsc_static_1.5.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/jets_csmsc_static_1.5.0.zip)
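+
+With the pretrained checkpoint you can run end-to-end synthesis directly. A minimal sketch, assuming the archive unpacks to `jets_csmsc_ckpt_1.5.0/` with a `default.yaml`, a `snapshot_iter_*.pdz` checkpoint and a `phone_id_map.txt` inside (check the actual file names after unzipping):
+```bash
+unzip jets_csmsc_ckpt_1.5.0.zip
+source path.sh
+# file names inside the archive are assumptions; adjust them to what you actually find
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/synthesize_e2e.py \
+    --am=jets_csmsc \
+    --config=jets_csmsc_ckpt_1.5.0/default.yaml \
+    --ckpt=jets_csmsc_ckpt_1.5.0/snapshot_iter_XXXXX.pdz \
+    --phones_dict=jets_csmsc_ckpt_1.5.0/phone_id_map.txt \
+    --text=${BIN_DIR}/../../assets/sentences.txt \
+    --output_dir=exp/default/test_e2e
+```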
diff --git a/examples/csmsc/jets/conf/default.yaml b/examples/csmsc/jets/conf/default.yaml
new file mode 100644
index 00000000..1dafd20c
--- /dev/null
+++ b/examples/csmsc/jets/conf/default.yaml
@@ -0,0 +1,224 @@
+# This configuration was tested on 4 GPUs (V100) with 32GB GPU
+# memory. It takes around 2 weeks to finish the training,
+# but a model trained for 100k iters should already generate reasonable results.
+###########################################################
+# FEATURE EXTRACTION SETTING #
+###########################################################
+
+n_mels: 80
+fs: 22050 # sr
+n_fft: 1024 # FFT size (samples).
+n_shift: 256 # Hop size (samples). 12.5ms
+win_length: null # Window length (samples). 50ms
+ # If set to null, it will be the same as fft_size.
+window: "hann" # Window function.
+fmin: 0 # minimum frequency for Mel basis
+fmax: null # maximum frequency for Mel basis
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
+
+
+##########################################################
+# TTS MODEL SETTING #
+##########################################################
+model:
+ # generator related
+ generator_type: jets_generator
+ generator_params:
+ adim: 256 # attention dimension
+ aheads: 2 # number of attention heads
+ elayers: 4 # number of encoder layers
+ eunits: 1024 # number of encoder ff units
+ dlayers: 4 # number of decoder layers
+ dunits: 1024 # number of decoder ff units
+ positionwise_layer_type: conv1d # type of position-wise layer
+ positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
+ duration_predictor_layers: 2 # number of layers of duration predictor
+ duration_predictor_chans: 256 # number of channels of duration predictor
+ duration_predictor_kernel_size: 3 # filter size of duration predictor
+ use_masking: True # whether to apply masking for padded part in loss calculation
+ encoder_normalize_before: True # whether to perform layer normalization before the input
+ decoder_normalize_before: True # whether to perform layer normalization before the input
+ encoder_type: transformer # encoder type
+ decoder_type: transformer # decoder type
+ conformer_rel_pos_type: latest # relative positional encoding type
+ conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type
+ conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
+ conformer_activation_type: swish # conformer activation type
+ use_macaron_style_in_conformer: true # whether to use macaron style in conformer
+ use_cnn_in_conformer: true # whether to use CNN in conformer
+ conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder
+ conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder
+ init_type: xavier_uniform # initialization type
+ init_enc_alpha: 1.0 # initial value of alpha for encoder
+ init_dec_alpha: 1.0 # initial value of alpha for decoder
+ transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
+ transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
+ transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
+ transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
+ transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
+ transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
+ pitch_predictor_layers: 5 # number of conv layers in pitch predictor
+ pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
+ pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
+ pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
+ pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
+ pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
+ stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
+ energy_predictor_layers: 2 # number of conv layers in energy predictor
+ energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
+ energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
+ energy_predictor_dropout: 0.5 # dropout rate in energy predictor
+ energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
+ energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
+ stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
+ generator_out_channels: 1
+ generator_channels: 512
+ generator_global_channels: -1
+ generator_kernel_size: 7
+ generator_upsample_scales: [8, 8, 2, 2]
+ generator_upsample_kernel_sizes: [16, 16, 4, 4]
+ generator_resblock_kernel_sizes: [3, 7, 11]
+ generator_resblock_dilations: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+ generator_use_additional_convs: true
+ generator_bias: true
+ generator_nonlinear_activation: "leakyrelu"
+ generator_nonlinear_activation_params:
+ negative_slope: 0.1
+ generator_use_weight_norm: true
+ segment_size: 64 # segment size for random windowed discriminator
+
+ # discriminator related
+ discriminator_type: hifigan_multi_scale_multi_period_discriminator
+ discriminator_params:
+ scales: 1
+ scale_downsample_pooling: "AvgPool1D"
+ scale_downsample_pooling_params:
+ kernel_size: 4
+ stride: 2
+ padding: 2
+ scale_discriminator_params:
+ in_channels: 1
+ out_channels: 1
+ kernel_sizes: [15, 41, 5, 3]
+ channels: 128
+ max_downsample_channels: 1024
+ max_groups: 16
+ bias: True
+ downsample_scales: [2, 2, 4, 4, 1]
+ nonlinear_activation: "leakyrelu"
+ nonlinear_activation_params:
+ negative_slope: 0.1
+ use_weight_norm: True
+ use_spectral_norm: False
+ follow_official_norm: False
+ periods: [2, 3, 5, 7, 11]
+ period_discriminator_params:
+ in_channels: 1
+ out_channels: 1
+ kernel_sizes: [5, 3]
+ channels: 32
+ downsample_scales: [3, 3, 3, 3, 1]
+ max_downsample_channels: 1024
+ bias: True
+ nonlinear_activation: "leakyrelu"
+ nonlinear_activation_params:
+ negative_slope: 0.1
+ use_weight_norm: True
+ use_spectral_norm: False
+ # others
+ sampling_rate: 22050 # needed in the inference for saving wav
+ cache_generator_outputs: True # whether to cache generator outputs in the training
+use_alignment_module: False # whether to use alignment module
+
+###########################################################
+# LOSS SETTING #
+###########################################################
+# loss function related
+generator_adv_loss_params:
+ average_by_discriminators: False # whether to average loss value by #discriminators
+ loss_type: mse # loss type, "mse" or "hinge"
+discriminator_adv_loss_params:
+ average_by_discriminators: False # whether to average loss value by #discriminators
+ loss_type: mse # loss type, "mse" or "hinge"
+feat_match_loss_params:
+ average_by_discriminators: False # whether to average loss value by #discriminators
+ average_by_layers: False # whether to average loss value by #layers of each discriminator
+ include_final_outputs: True # whether to include final outputs for loss calculation
+mel_loss_params:
+ fs: 22050 # must be the same as the training data
+ fft_size: 1024 # fft points
+ hop_size: 256 # hop size
+ win_length: null # window length
+ window: hann # window type
+ num_mels: 80 # number of Mel basis
+ fmin: 0 # minimum frequency for Mel basis
+ fmax: null # maximum frequency for Mel basis
+ log_base: null # null represents natural log
+
+###########################################################
+# ADVERSARIAL LOSS SETTING #
+###########################################################
+lambda_adv: 1.0 # loss scaling coefficient for adversarial loss
+lambda_mel: 45.0 # loss scaling coefficient for Mel loss
+lambda_feat_match: 2.0 # loss scaling coefficient for feat match loss
+lambda_var: 1.0 # loss scaling coefficient for duration loss
+lambda_align: 2.0 # loss scaling coefficient for KL divergence loss
+# others
+sampling_rate: 22050 # needed in the inference for saving wav
+cache_generator_outputs: True # whether to cache generator outputs in the training
+
+
+# extra module for additional inputs
+pitch_extract: dio # pitch extractor type
+pitch_extract_conf:
+ reduction_factor: 1
+ use_token_averaged_f0: false
+pitch_normalize: global_mvn # normalizer for the pitch feature
+energy_extract: energy # energy extractor type
+energy_extract_conf:
+ reduction_factor: 1
+ use_token_averaged_energy: false
+energy_normalize: global_mvn # normalizer for the energy feature
+
+
+###########################################################
+# DATA LOADER SETTING #
+###########################################################
+batch_size: 32 # Batch size.
+num_workers: 4 # Number of workers in DataLoader.
+
+##########################################################
+# OPTIMIZER & SCHEDULER SETTING #
+##########################################################
+# optimizer setting for generator
+generator_optimizer_params:
+ beta1: 0.8
+ beta2: 0.99
+ epsilon: 1.0e-9
+ weight_decay: 0.0
+generator_scheduler: exponential_decay
+generator_scheduler_params:
+ learning_rate: 2.0e-4
+ gamma: 0.999875
+
+# optimizer setting for discriminator
+discriminator_optimizer_params:
+ beta1: 0.8
+ beta2: 0.99
+ epsilon: 1.0e-9
+ weight_decay: 0.0
+discriminator_scheduler: exponential_decay
+discriminator_scheduler_params:
+ learning_rate: 2.0e-4
+ gamma: 0.999875
+generator_first: True # whether to start updating generator first
+
+##########################################################
+# OTHER TRAINING SETTING #
+##########################################################
+num_snapshots: 10 # max number of snapshots to keep while training
+train_max_steps: 350000 # Number of training steps. == total_iters / ngpus, total_iters = 1000000
+save_interval_steps: 1000 # Interval steps to save checkpoint.
+eval_interval_steps: 250 # Interval steps to evaluate the network.
+seed: 777 # random seed number
diff --git a/examples/csmsc/jets/local/inference.sh b/examples/csmsc/jets/local/inference.sh
new file mode 100755
index 00000000..987f4cea
--- /dev/null
+++ b/examples/csmsc/jets/local/inference.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ python3 ${BIN_DIR}/inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=jets_csmsc \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt
+fi
diff --git a/examples/csmsc/jets/local/preprocess.sh b/examples/csmsc/jets/local/preprocess.sh
new file mode 100755
index 00000000..60053131
--- /dev/null
+++ b/examples/csmsc/jets/local/preprocess.sh
@@ -0,0 +1,77 @@
+#!/bin/bash
+set -e
+stage=0
+stop_stage=100
+
+config_path=$1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # get durations from MFA's result
+ echo "Generate durations.txt from MFA results ..."
+ python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
+ --inputdir=./baker_alignment_tone \
+ --output=durations.txt \
+ --config=${config_path}
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # extract features
+ echo "Extract features ..."
+ python3 ${BIN_DIR}/preprocess.py \
+ --dataset=baker \
+ --rootdir=~/datasets/BZNSYP/ \
+ --dumpdir=dump \
+ --dur-file=durations.txt \
+ --config=${config_path} \
+ --num-cpu=20 \
+ --cut-sil=True \
+ --token_average=True
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # get features' stats (mean and std)
+ echo "Get features' stats ..."
+ python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --field-name="feats"
+
+ python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --field-name="pitch"
+
+ python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --field-name="energy"
+
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # normalize and convert phone/speaker to id; dev and test should use train's stats
+ echo "Normalize ..."
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --dumpdir=dump/train/norm \
+ --feats-stats=dump/train/feats_stats.npy \
+ --pitch-stats=dump/train/pitch_stats.npy \
+ --energy-stats=dump/train/energy_stats.npy \
+ --phones-dict=dump/phone_id_map.txt \
+ --speaker-dict=dump/speaker_id_map.txt
+
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/dev/raw/metadata.jsonl \
+ --dumpdir=dump/dev/norm \
+ --feats-stats=dump/train/feats_stats.npy \
+ --pitch-stats=dump/train/pitch_stats.npy \
+ --energy-stats=dump/train/energy_stats.npy \
+ --phones-dict=dump/phone_id_map.txt \
+ --speaker-dict=dump/speaker_id_map.txt
+
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/test/raw/metadata.jsonl \
+ --dumpdir=dump/test/norm \
+ --feats-stats=dump/train/feats_stats.npy \
+ --pitch-stats=dump/train/pitch_stats.npy \
+ --energy-stats=dump/train/energy_stats.npy \
+ --phones-dict=dump/phone_id_map.txt \
+ --speaker-dict=dump/speaker_id_map.txt
+fi
diff --git a/examples/csmsc/jets/local/synthesize.sh b/examples/csmsc/jets/local/synthesize.sh
new file mode 100755
index 00000000..a4b35ec0
--- /dev/null
+++ b/examples/csmsc/jets/local/synthesize.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+stage=0
+stop_stage=0
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/synthesize.py \
+ --config=${config_path} \
+ --ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --phones_dict=dump/phone_id_map.txt \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test
+fi
diff --git a/examples/csmsc/jets/local/synthesize_e2e.sh b/examples/csmsc/jets/local/synthesize_e2e.sh
new file mode 100755
index 00000000..c95354d8
--- /dev/null
+++ b/examples/csmsc/jets/local/synthesize_e2e.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+stage=0
+stop_stage=0
+
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/synthesize_e2e.py \
+ --am=jets_csmsc \
+ --config=${config_path} \
+ --ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --phones_dict=dump/phone_id_map.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --inference_dir=${train_output_path}/inference
+fi
diff --git a/examples/csmsc/jets/local/train.sh b/examples/csmsc/jets/local/train.sh
new file mode 100755
index 00000000..d1302f99
--- /dev/null
+++ b/examples/csmsc/jets/local/train.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
+ --train-metadata=dump/train/norm/metadata.jsonl \
+ --dev-metadata=dump/dev/norm/metadata.jsonl \
+ --config=${config_path} \
+ --output-dir=${train_output_path} \
+ --ngpu=1 \
+ --phones-dict=dump/phone_id_map.txt
diff --git a/examples/csmsc/jets/path.sh b/examples/csmsc/jets/path.sh
new file mode 100755
index 00000000..73a0af7e
--- /dev/null
+++ b/examples/csmsc/jets/path.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=jets
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/csmsc/jets/run.sh b/examples/csmsc/jets/run.sh
new file mode 100755
index 00000000..d0985c50
--- /dev/null
+++ b/examples/csmsc/jets/run.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_150000.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ ./local/preprocess.sh ${conf_path}|| exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # train model, all `ckpt` files are saved under the `train_output_path/checkpoints/` dir
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # synthesize_e2e
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
+fi
+
diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md
index bc7769d1..ce682495 100644
--- a/examples/csmsc/tts0/README.md
+++ b/examples/csmsc/tts0/README.md
@@ -226,7 +226,7 @@ tacotron2_csmsc_ckpt_0.2.0
├── snapshot_iter_30600.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training Tacotron2
```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained Tacotron2 and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained Tacotron2 and parallel wavegan models.
```bash
source path.sh
@@ -242,7 +242,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=tacotron2_csmsc_ckpt_0.2.0/phone_id_map.txt
diff --git a/examples/csmsc/tts0/local/inference.sh b/examples/csmsc/tts0/local/inference.sh
index d2960441..6ea2e4b6 100755
--- a/examples/csmsc/tts0/local/inference.sh
+++ b/examples/csmsc/tts0/local/inference.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=tacotron2_csmsc \
--voc=pwgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=tacotron2_csmsc \
--voc=mb_melgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@@ -33,7 +33,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/inference \
--am=tacotron2_csmsc \
--voc=hifigan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
\ No newline at end of file
diff --git a/examples/csmsc/tts0/local/synthesize_e2e.sh b/examples/csmsc/tts0/local/synthesize_e2e.sh
index 4c3b08dc..40b49aa1 100755
--- a/examples/csmsc/tts0/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts0/local/synthesize_e2e.sh
@@ -22,7 +22,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
@@ -44,7 +44,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
@@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt
# --inference_dir=${train_output_path}/inference
@@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
@@ -108,7 +108,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md
index ec88959d..96956776 100644
--- a/examples/csmsc/tts2/README.md
+++ b/examples/csmsc/tts2/README.md
@@ -248,7 +248,7 @@ speedyspeech_csmsc_ckpt_0.2.0
├── snapshot_iter_30600.pdz # model parameters and optimizer states
└── tone_id_map.txt # tone vocabulary file when training speedyspeech
```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained speedyspeech and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained speedyspeech and parallel wavegan models.
```bash
source path.sh
@@ -264,7 +264,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=speedyspeech_csmsc_ckpt_0.2.0/phone_id_map.txt \
diff --git a/examples/csmsc/tts2/local/inference.sh b/examples/csmsc/tts2/local/inference.sh
index ed92136c..9a677edc 100755
--- a/examples/csmsc/tts2/local/inference.sh
+++ b/examples/csmsc/tts2/local/inference.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=pwgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
@@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/inference \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
diff --git a/examples/csmsc/tts2/local/inference_xpu.sh b/examples/csmsc/tts2/local/inference_xpu.sh
new file mode 100644
index 00000000..5d8d9205
--- /dev/null
+++ b/examples/csmsc/tts2/local/inference_xpu.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=speedyspeech_csmsc \
+ --voc=pwgan_csmsc \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --device xpu
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=speedyspeech_csmsc \
+ --voc=mb_melgan_csmsc \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --device xpu
+fi
+
+# hifigan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=speedyspeech_csmsc \
+ --voc=hifigan_csmsc \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --device xpu
+fi
diff --git a/examples/csmsc/tts2/local/lite_predict.sh b/examples/csmsc/tts2/local/lite_predict.sh
index d0c6c058..9bb33cdf 100755
--- a/examples/csmsc/tts2/local/lite_predict.sh
+++ b/examples/csmsc/tts2/local/lite_predict.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=speedyspeech_csmsc \
--voc=pwgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
@@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
diff --git a/examples/csmsc/tts2/local/ort_predict.sh b/examples/csmsc/tts2/local/ort_predict.sh
index 8ca4c0e9..36f88667 100755
--- a/examples/csmsc/tts2/local/ort_predict.sh
+++ b/examples/csmsc/tts2/local/ort_predict.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=speedyspeech_csmsc \
--voc=pwgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device=cpu \
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=speedyspeech_csmsc \
--voc=mb_melgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device=cpu \
@@ -36,7 +36,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=speedyspeech_csmsc \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
--device=cpu \
diff --git a/examples/csmsc/tts2/local/synthesize_e2e.sh b/examples/csmsc/tts2/local/synthesize_e2e.sh
index 553b4554..2b278729 100755
--- a/examples/csmsc/tts2/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts2/local/synthesize_e2e.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
@@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
@@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt
@@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
@@ -109,7 +109,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--tones_dict=dump/tone_id_map.txt \
diff --git a/examples/csmsc/tts2/local/synthesize_e2e_xpu.sh b/examples/csmsc/tts2/local/synthesize_e2e_xpu.sh
new file mode 100644
index 00000000..0285f42c
--- /dev/null
+++ b/examples/csmsc/tts2/local/synthesize_e2e_xpu.sh
@@ -0,0 +1,122 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=pwgan_csmsc \
+ --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+ --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=mb_melgan_csmsc \
+ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# the pretrained models haven't been released yet
+# style melgan
+# style melgan's Dygraph to Static Graph conversion is not ready yet
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=style_melgan_csmsc \
+ --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+ --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+ # --inference_dir=${train_output_path}/inference
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=hifigan_csmsc \
+ --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+ --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ echo "in wavernn syn_e2e"
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=wavernn_csmsc \
+ --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+ --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+ --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --ngpu=0 \
+ --nxpu=1
+fi
diff --git a/examples/csmsc/tts2/local/synthesize_xpu.sh b/examples/csmsc/tts2/local/synthesize_xpu.sh
new file mode 100644
index 00000000..801789c2
--- /dev/null
+++ b/examples/csmsc/tts2/local/synthesize_xpu.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=pwgan_csmsc \
+ --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+ --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=mb_melgan_csmsc \
+ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# style melgan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=style_melgan_csmsc \
+ --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+ --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ echo "in hifigan syn"
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=hifigan_csmsc \
+ --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+ --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --tones_dict=dump/tone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ echo "in wavernn syn"
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=speedyspeech_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/feats_stats.npy \
+ --voc=wavernn_csmsc \
+ --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+ --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+ --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --tones_dict=dump/tone_id_map.txt \
+ --phones_dict=dump/phone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
diff --git a/examples/csmsc/tts2/local/train_xpu.sh b/examples/csmsc/tts2/local/train_xpu.sh
new file mode 100644
index 00000000..0c07c27f
--- /dev/null
+++ b/examples/csmsc/tts2/local/train_xpu.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+
+python ${BIN_DIR}/train.py \
+ --train-metadata=dump/train/norm/metadata.jsonl \
+ --dev-metadata=dump/dev/norm/metadata.jsonl \
+ --config=${config_path} \
+ --output-dir=${train_output_path} \
+ --ngpu=0 \
+ --nxpu=1 \
+ --phones-dict=dump/phone_id_map.txt \
+ --tones-dict=dump/tone_id_map.txt \
+ --use-relative-path=True
diff --git a/examples/csmsc/tts2/run.sh b/examples/csmsc/tts2/run.sh
index 6279ec57..5732ea3c 100755
--- a/examples/csmsc/tts2/run.sh
+++ b/examples/csmsc/tts2/run.sh
@@ -45,10 +45,7 @@ fi
# we have only tested the following models so far
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
- version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
- if [[ -z "$version" || ${version} != '1.0.0' ]]; then
- pip install paddle2onnx==1.0.0
- fi
+ pip install paddle2onnx --upgrade
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx speedyspeech_csmsc
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc
diff --git a/examples/csmsc/tts2/run_xpu.sh b/examples/csmsc/tts2/run_xpu.sh
new file mode 100644
index 00000000..4b867961
--- /dev/null
+++ b/examples/csmsc/tts2/run_xpu.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+xpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_76.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run_xpu.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model; all checkpoints (`ckpt`) are saved under the `train_output_path/checkpoints/` dir
+ FLAGS_selected_xpus=${xpus} ./local/train_xpu.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # synthesize, vocoder is pwgan by default
+ FLAGS_selected_xpus=${xpus} ./local/synthesize_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # synthesize_e2e, vocoder is pwgan by default
+ FLAGS_selected_xpus=${xpus} ./local/synthesize_e2e_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # inference with static model
+ FLAGS_selected_xpus=${xpus} ./local/inference_xpu.sh ${train_output_path} || exit -1
+fi
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md
index 39926259..5a097537 100644
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@@ -258,7 +258,7 @@ fastspeech2_nosil_baker_ckpt_0.4
├── snapshot_iter_76000.pdz # model parameters and optimizer states
└── speech_stats.npy # statistics used to normalize spectrogram when training fastspeech2
```
-You can use the following scripts to synthesize for `${BIN_DIR}/../sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
+You can use the following scripts to synthesize for `${BIN_DIR}/../../assets/sentences.txt` using pretrained fastspeech2 and parallel wavegan models.
If you want to use fastspeech2_conformer, you must delete this line `--inference_dir=exp/default/inference \` to skip the step of dygraph to static graph, cause we haven't tested dygraph to static graph for fastspeech2_conformer till now.
```bash
@@ -276,7 +276,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
diff --git a/examples/csmsc/tts3/README_cn.md b/examples/csmsc/tts3/README_cn.md
index 1829b770..3f2783a9 100644
--- a/examples/csmsc/tts3/README_cn.md
+++ b/examples/csmsc/tts3/README_cn.md
@@ -248,7 +248,7 @@ fastspeech2_nosil_baker_ckpt_0.4
├── snapshot_iter_76000.pdz # 模型参数和优化器状态
└── speech_stats.npy # 训练 fastspeech2 时用于规范化频谱图的统计数据
```
-您可以使用以下脚本通过使用预训练的 fastspeech2 和 parallel wavegan 模型为 `${BIN_DIR}/../sentences.txt` 合成句子
+您可以使用以下脚本通过使用预训练的 fastspeech2 和 parallel wavegan 模型为 `${BIN_DIR}/../../assets/sentences.txt` 合成句子
```bash
source path.sh
@@ -264,7 +264,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
diff --git a/examples/csmsc/tts3/local/inference.sh b/examples/csmsc/tts3/local/inference.sh
index b43fd286..5b143cdd 100755
--- a/examples/csmsc/tts3/local/inference.sh
+++ b/examples/csmsc/tts3/local/inference.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=pwgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=mb_melgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=hifigan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@@ -45,7 +45,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_csmsc \
--voc=wavernn_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
\ No newline at end of file
diff --git a/examples/csmsc/tts3/local/inference_streaming.sh b/examples/csmsc/tts3/local/inference_streaming.sh
index 719f46c6..5ad50aa5 100755
--- a/examples/csmsc/tts3/local/inference_streaming.sh
+++ b/examples/csmsc/tts3/local/inference_streaming.sh
@@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
@@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
@@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
diff --git a/examples/csmsc/tts3/local/inference_xpu.sh b/examples/csmsc/tts3/local/inference_xpu.sh
new file mode 100644
index 00000000..541dc626
--- /dev/null
+++ b/examples/csmsc/tts3/local/inference_xpu.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+
+train_output_path=$1
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=fastspeech2_csmsc \
+ --voc=pwgan_csmsc \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt \
+ --device xpu
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=fastspeech2_csmsc \
+ --voc=mb_melgan_csmsc \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt \
+ --device xpu
+fi
+
+# hifigan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=fastspeech2_csmsc \
+ --voc=hifigan_csmsc \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt \
+ --device xpu
+fi
+
+# wavernn
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ python3 ${BIN_DIR}/../inference.py \
+ --inference_dir=${train_output_path}/inference \
+ --am=fastspeech2_csmsc \
+ --voc=wavernn_csmsc \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/pd_infer_out \
+ --phones_dict=dump/phone_id_map.txt \
+ --device xpu
+fi
\ No newline at end of file
diff --git a/examples/csmsc/tts3/local/lite_predict.sh b/examples/csmsc/tts3/local/lite_predict.sh
index 1ed2f108..9af17899 100755
--- a/examples/csmsc/tts3/local/lite_predict.sh
+++ b/examples/csmsc/tts3/local/lite_predict.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_csmsc \
--voc=pwgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_csmsc \
--voc=mb_melgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
@@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_csmsc \
--voc=hifigan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt
fi
diff --git a/examples/csmsc/tts3/local/lite_predict_streaming.sh b/examples/csmsc/tts3/local/lite_predict_streaming.sh
index 4570cb4e..19fdde41 100755
--- a/examples/csmsc/tts3/local/lite_predict_streaming.sh
+++ b/examples/csmsc/tts3/local/lite_predict_streaming.sh
@@ -12,7 +12,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
@@ -26,7 +26,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
@@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_csmsc \
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
diff --git a/examples/csmsc/tts3/local/ort_predict.sh b/examples/csmsc/tts3/local/ort_predict.sh
index e16c7bd0..99955665 100755
--- a/examples/csmsc/tts3/local/ort_predict.sh
+++ b/examples/csmsc/tts3/local/ort_predict.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_csmsc \
--voc=pwgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2
@@ -22,7 +22,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_csmsc \
--voc=mb_melgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2
@@ -34,7 +34,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_csmsc \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2
diff --git a/examples/csmsc/tts3/local/ort_predict_streaming.sh b/examples/csmsc/tts3/local/ort_predict_streaming.sh
index 74393581..e2c7e852 100755
--- a/examples/csmsc/tts3/local/ort_predict_streaming.sh
+++ b/examples/csmsc/tts3/local/ort_predict_streaming.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am_stat=dump/train/speech_stats.npy \
--voc=pwgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_streaming \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
@@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am_stat=dump/train/speech_stats.npy \
--voc=mb_melgan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_streaming \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
@@ -39,7 +39,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am_stat=dump/train/speech_stats.npy \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_streaming \
- --text=${BIN_DIR}/../csmsc_test.txt \
+ --text=${BIN_DIR}/../../assets/csmsc_test.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
diff --git a/examples/csmsc/tts3/local/synthesize_e2e.sh b/examples/csmsc/tts3/local/synthesize_e2e.sh
index 512e062b..35a5598a 100755
--- a/examples/csmsc/tts3/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts3/local/synthesize_e2e.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
@@ -42,7 +42,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
@@ -64,7 +64,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt
# --inference_dir=${train_output_path}/inference
@@ -85,7 +85,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
@@ -107,7 +107,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference
diff --git a/examples/csmsc/tts3/local/synthesize_e2e_xpu.sh b/examples/csmsc/tts3/local/synthesize_e2e_xpu.sh
new file mode 100644
index 00000000..bb58a37c
--- /dev/null
+++ b/examples/csmsc/tts3/local/synthesize_e2e_xpu.sh
@@ -0,0 +1,119 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_csmsc \
+ --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+ --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=mb_melgan_csmsc \
+ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# the pretrained models haven't been released yet
+# style melgan
+# style melgan's Dygraph to Static Graph conversion is not ready yet
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=style_melgan_csmsc \
+ --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+ --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+ # --inference_dir=${train_output_path}/inference
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ echo "in hifigan syn_e2e"
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=hifigan_csmsc \
+ --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+ --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ echo "in wavernn syn_e2e"
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=wavernn_csmsc \
+ --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+ --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+ --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --inference_dir=${train_output_path}/inference \
+ --ngpu=0 \
+ --nxpu=1
+fi
diff --git a/examples/csmsc/tts3/local/synthesize_streaming.sh b/examples/csmsc/tts3/local/synthesize_streaming.sh
index 366a88db..f4e783d4 100755
--- a/examples/csmsc/tts3/local/synthesize_streaming.sh
+++ b/examples/csmsc/tts3/local/synthesize_streaming.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True \
@@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True \
@@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True
@@ -87,7 +87,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e_streaming \
--phones_dict=dump/phone_id_map.txt \
--am_streaming=True \
diff --git a/examples/csmsc/tts3/local/synthesize_xpu.sh b/examples/csmsc/tts3/local/synthesize_xpu.sh
new file mode 100644
index 00000000..fac8677a
--- /dev/null
+++ b/examples/csmsc/tts3/local/synthesize_xpu.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_csmsc \
+ --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+ --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=mb_melgan_csmsc \
+ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# style melgan
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=style_melgan_csmsc \
+ --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+ --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ echo "in hifigan syn"
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=hifigan_csmsc \
+ --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+ --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
+
+# wavernn
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ echo "in wavernn syn"
+ FLAGS_allocator_strategy=naive_best_fit \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=wavernn_csmsc \
+ --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \
+ --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
+ --voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --ngpu=0 \
+ --nxpu=1
+fi
diff --git a/examples/csmsc/tts3/local/train_xpu.sh b/examples/csmsc/tts3/local/train_xpu.sh
new file mode 100644
index 00000000..a7d88988
--- /dev/null
+++ b/examples/csmsc/tts3/local/train_xpu.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
+ --train-metadata=dump/train/norm/metadata.jsonl \
+ --dev-metadata=dump/dev/norm/metadata.jsonl \
+ --config=${config_path} \
+ --output-dir=${train_output_path} \
+ --ngpu=0 \
+ --nxpu=1 \
+ --phones-dict=dump/phone_id_map.txt
\ No newline at end of file
diff --git a/examples/csmsc/tts3/run.sh b/examples/csmsc/tts3/run.sh
index dd8c9f3e..a7b4e423 100755
--- a/examples/csmsc/tts3/run.sh
+++ b/examples/csmsc/tts3/run.sh
@@ -45,10 +45,7 @@ fi
# we have only tested the following models so far
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
- version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
- if [[ -z "$version" || ${version} != '1.0.0' ]]; then
- pip install paddle2onnx==1.0.0
- fi
+ pip install paddle2onnx --upgrade
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_csmsc
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc
diff --git a/examples/csmsc/tts3/run_cnndecoder.sh b/examples/csmsc/tts3/run_cnndecoder.sh
index 96b446c5..f356f313 100755
--- a/examples/csmsc/tts3/run_cnndecoder.sh
+++ b/examples/csmsc/tts3/run_cnndecoder.sh
@@ -58,10 +58,7 @@ fi
# paddle2onnx non streaming
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# install paddle2onnx
- version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
- if [[ -z "$version" || ${version} != '1.0.0' ]]; then
- pip install paddle2onnx==1.0.0
- fi
+ pip install paddle2onnx --upgrade
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_csmsc
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_csmsc
@@ -77,10 +74,7 @@ fi
# paddle2onnx streaming
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
# install paddle2onnx
- version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
- if [[ -z "$version" || ${version} != '1.0.0' ]]; then
- pip install paddle2onnx==1.0.0
- fi
+ pip install paddle2onnx --upgrade
# streaming acoustic model
./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_encoder_infer
./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_decoder
diff --git a/examples/csmsc/tts3/run_xpu.sh b/examples/csmsc/tts3/run_xpu.sh
new file mode 100644
index 00000000..4922d6b4
--- /dev/null
+++ b/examples/csmsc/tts3/run_xpu.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+xpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_153.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run_xpu.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model; all checkpoints (`ckpt`) are saved under the `train_output_path/checkpoints/` dir
+ FLAGS_selected_xpus=${xpus} ./local/train_xpu.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # synthesize, vocoder is pwgan by default
+ FLAGS_selected_xpus=${xpus} ./local/synthesize_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # synthesize_e2e, vocoder is pwgan by default
+ FLAGS_selected_xpus=${xpus} ./local/synthesize_e2e_xpu.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # inference with static model, vocoder is pwgan by default
+ FLAGS_selected_xpus=${xpus} ./local/inference_xpu.sh ${train_output_path} || exit -1
+fi
diff --git a/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh b/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh
index 8f5d8010..bf7229e1 100755
--- a/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh
+++ b/examples/csmsc/tts3_rhy/local/synthesize_e2e.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference \
@@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
--voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference \
@@ -66,7 +66,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--use_rhy=True
@@ -88,7 +88,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference \
@@ -111,7 +111,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
--voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \
--voc_stat=wavernn_csmsc_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--inference_dir=${train_output_path}/inference \
diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md
index 50d703b2..83871277 100644
--- a/examples/csmsc/vits/README.md
+++ b/examples/csmsc/vits/README.md
@@ -172,6 +172,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--ckpt=vits_csmsc_ckpt_1.4.0/snapshot_iter_150000.pdz \
--phones_dict=vits_csmsc_ckpt_1.4.0/phone_id_map.txt \
--output_dir=exp/default/test_e2e \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--add-blank=${add_blank}
```
diff --git a/examples/csmsc/vits/conf/default.yaml b/examples/csmsc/vits/conf/default.yaml
index a2aef998..7e9e9c1d 100644
--- a/examples/csmsc/vits/conf/default.yaml
+++ b/examples/csmsc/vits/conf/default.yaml
@@ -179,7 +179,7 @@ generator_first: False # whether to start updating generator first
# OTHER TRAINING SETTING #
##########################################################
num_snapshots: 10 # max number of snapshots to keep while training
-train_max_steps: 350000 # Number of training steps. == total_iters / ngpus, total_iters = 1000000
-save_interval_steps: 1000 # Interval steps to save checkpoint.
-eval_interval_steps: 250 # Interval steps to evaluate the network.
+max_epoch: 1000 # Number of training epochs.
+save_interval_epochs: 1 # Interval epochs to save checkpoint.
+eval_interval_epochs: 1         # Interval epochs to evaluate the network.
seed: 777 # random seed number
diff --git a/examples/csmsc/vits/local/inference.sh b/examples/csmsc/vits/local/inference.sh
index 0a79c255..d26b7f71 100755
--- a/examples/csmsc/vits/local/inference.sh
+++ b/examples/csmsc/vits/local/inference.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/inference.py \
--inference_dir=${train_output_path}/inference \
--am=vits_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--add-blank=${add_blank}
diff --git a/examples/csmsc/vits/local/lite_predict.sh b/examples/csmsc/vits/local/lite_predict.sh
index 9ed57b72..d20d7a57 100755
--- a/examples/csmsc/vits/local/lite_predict.sh
+++ b/examples/csmsc/vits/local/lite_predict.sh
@@ -7,10 +7,10 @@ stage=0
stop_stage=0
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
- python3 ${BIN_DIR}/../lite_predict.py \
+ python3 ${BIN_DIR}/lite_predict.py \
--inference_dir=${train_output_path}/pdlite \
--am=vits_csmsc \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--add-blank=${add_blank}
diff --git a/examples/csmsc/vits/local/synthesize_e2e.sh b/examples/csmsc/vits/local/synthesize_e2e.sh
index 6a69b366..f3c067e4 100755
--- a/examples/csmsc/vits/local/synthesize_e2e.sh
+++ b/examples/csmsc/vits/local/synthesize_e2e.sh
@@ -18,7 +18,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--phones_dict=dump/phone_id_map.txt \
--output_dir=${train_output_path}/test_e2e \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--add-blank=${add_blank} #\
# --inference_dir=${train_output_path}/inference
fi
diff --git a/examples/csmsc/vits/run.sh b/examples/csmsc/vits/run.sh
index f2c5d452..f6e8a086 100755
--- a/examples/csmsc/vits/run.sh
+++ b/examples/csmsc/vits/run.sh
@@ -45,10 +45,7 @@ fi
# # we have only tested the following models so far
# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# # install paddle2onnx
-# version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
-# if [[ -z "$version" || ${version} != '1.0.0' ]]; then
-# pip install paddle2onnx==1.0.0
-# fi
+# pip install paddle2onnx --upgrade
# ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx vits_csmsc
# fi
@@ -57,16 +54,16 @@ fi
# ./local/ort_predict.sh ${train_output_path}
# fi
-# # not ready yet for operator missing in Paddle-Lite
-# # must run after stage 3 (which stage generated static models)
-# if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
-# # NOTE by yuantian 2022.11.21: please compile develop version of Paddle-Lite to export and run TTS models,
-# # cause TTS models are supported by https://github.com/PaddlePaddle/Paddle-Lite/pull/9587
-# # and https://github.com/PaddlePaddle/Paddle-Lite/pull/9706
-# ./local/export2lite.sh ${train_output_path} inference pdlite vits_csmsc x86
-# fi
+# not ready yet due to missing operators in Paddle-Lite
+# must run after stage 3 (the stage that generates static models)
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+    # NOTE by yuantian 2022.11.21: please compile the develop version of Paddle-Lite to export and run TTS models,
+    # because TTS models are supported by https://github.com/PaddlePaddle/Paddle-Lite/pull/10128
+    # vits can only run on arm
+ ./local/export2lite.sh ${train_output_path} inference pdlite vits_csmsc arm
+fi
-# if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
-# CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
-# fi
+if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
+fi
diff --git a/examples/csmsc/voc5/conf/iSTFT.yaml b/examples/csmsc/voc5/conf/iSTFT.yaml
new file mode 100644
index 00000000..06677d79
--- /dev/null
+++ b/examples/csmsc/voc5/conf/iSTFT.yaml
@@ -0,0 +1,174 @@
+# This is the configuration file for CSMSC dataset.
+# This configuration is based on HiFiGAN V1, which is an official configuration.
+# But I found that the optimizer setting does not work well with my implementation.
+# So I changed optimizer settings as follows:
+# - AdamW -> Adam
+# - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
+# - Scheduler: ExponentialLR -> MultiStepLR
+# To match the shift size difference, the upsample scales are also modified from the original 256-shift setting.
+
+###########################################################
+# FEATURE EXTRACTION SETTING #
+###########################################################
+fs: 24000 # Sampling rate.
+n_fft: 2048 # FFT size (samples).
+n_shift: 300 # Hop size (samples). 12.5ms
+win_length: 1200 # Window length (samples). 50ms
+ # If set to null, it will be the same as fft_size.
+window: "hann" # Window function.
+n_mels: 80 # Number of mel basis.
+fmin: 80 # Minimum freq in mel basis calculation. (Hz)
+fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
+
+###########################################################
+# GENERATOR NETWORK ARCHITECTURE SETTING #
+###########################################################
+generator_params:
+ use_istft: True # Use iSTFTNet.
+    istft_layer_id: 2                 # Apply iSTFT after the first istft_layer_id upsample layers if use_istft=True.
+ n_fft: 2048 # FFT size (samples) in feature extraction.
+ win_length: 1200 # Window length (samples) in feature extraction.
+ in_channels: 80 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+ channels: 512 # Number of initial channels.
+ kernel_size: 7 # Kernel size of initial and final conv layers.
+ upsample_scales: [5, 5, 4, 3] # Upsampling scales.
+ upsample_kernel_sizes: [10, 10, 8, 6] # Kernel size for upsampling layers.
+ resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
+ resblock_dilations: # Dilations for residual blocks.
+ - [1, 3, 5]
+ - [1, 3, 5]
+ - [1, 3, 5]
+ use_additional_convs: True # Whether to use additional conv layer in residual blocks.
+ bias: True # Whether to use bias parameter in conv.
+ nonlinear_activation: "leakyrelu" # Nonlinear activation type.
+    nonlinear_activation_params:       # Nonlinear activation parameters.
+ negative_slope: 0.1
+ use_weight_norm: True # Whether to apply weight normalization.
+
+
+###########################################################
+# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
+###########################################################
+discriminator_params:
+ scales: 3 # Number of multi-scale discriminator.
+ scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator.
+ scale_downsample_pooling_params:
+ kernel_size: 4 # Pooling kernel size.
+ stride: 2 # Pooling stride.
+ padding: 2 # Padding size.
+ scale_discriminator_params:
+ in_channels: 1 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+ kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
+ channels: 128 # Initial number of channels.
+ max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
+ max_groups: 16 # Maximum number of groups in downsampling conv layers.
+ bias: True
+ downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
+ nonlinear_activation: "leakyrelu" # Nonlinear activation.
+ nonlinear_activation_params:
+ negative_slope: 0.1
+ follow_official_norm: True # Whether to follow the official norm setting.
+ periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
+ period_discriminator_params:
+ in_channels: 1 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+ kernel_sizes: [5, 3] # List of kernel sizes.
+ channels: 32 # Initial number of channels.
+ downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
+ max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
+        bias: True                                 # Whether to use bias parameter in conv layer.
+ nonlinear_activation: "leakyrelu" # Nonlinear activation.
+        nonlinear_activation_params:         # Nonlinear activation parameters.
+ negative_slope: 0.1
+ use_weight_norm: True # Whether to apply weight normalization.
+ use_spectral_norm: False # Whether to apply spectral normalization.
+
+
+###########################################################
+# STFT LOSS SETTING #
+###########################################################
+use_stft_loss: False # Whether to use multi-resolution STFT loss.
+use_mel_loss: True # Whether to use Mel-spectrogram loss.
+mel_loss_params:
+ fs: 24000
+ fft_size: 2048
+ hop_size: 300
+ win_length: 1200
+ window: "hann"
+ num_mels: 80
+ fmin: 0
+ fmax: 12000
+ log_base: null
+generator_adv_loss_params:
+ average_by_discriminators: False # Whether to average loss by #discriminators.
+discriminator_adv_loss_params:
+ average_by_discriminators: False # Whether to average loss by #discriminators.
+use_feat_match_loss: True
+feat_match_loss_params:
+ average_by_discriminators: False # Whether to average loss by #discriminators.
+ average_by_layers: False # Whether to average loss by #layers in each discriminator.
+ include_final_outputs: False # Whether to include final outputs in feat match loss calculation.
+
+###########################################################
+# ADVERSARIAL LOSS SETTING #
+###########################################################
+lambda_aux: 45.0 # Loss balancing coefficient for STFT loss.
+lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss.
+lambda_feat_match: 2.0      # Loss balancing coefficient for feat match loss.
+
+###########################################################
+# DATA LOADER SETTING #
+###########################################################
+batch_size: 16 # Batch size.
+batch_max_steps: 8400       # Length of each audio in batch. Make sure divisible by hop_size.
+num_workers: 2 # Number of workers in DataLoader.
+
+###########################################################
+# OPTIMIZER & SCHEDULER SETTING #
+###########################################################
+generator_optimizer_params:
+ beta1: 0.5
+ beta2: 0.9
+ weight_decay: 0.0 # Generator's weight decay coefficient.
+generator_scheduler_params:
+ learning_rate: 2.0e-4 # Generator's learning rate.
+ gamma: 0.5 # Generator's scheduler gamma.
+ milestones: # At each milestone, lr will be multiplied by gamma.
+ - 200000
+ - 400000
+ - 600000
+ - 800000
+generator_grad_norm: -1 # Generator's gradient norm.
+discriminator_optimizer_params:
+ beta1: 0.5
+ beta2: 0.9
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
+discriminator_scheduler_params:
+ learning_rate: 2.0e-4 # Discriminator's learning rate.
+ gamma: 0.5 # Discriminator's scheduler gamma.
+ milestones: # At each milestone, lr will be multiplied by gamma.
+ - 200000
+ - 400000
+ - 600000
+ - 800000
+discriminator_grad_norm: -1 # Discriminator's gradient norm.
+
+###########################################################
+# INTERVAL SETTING #
+###########################################################
+generator_train_start_steps: 1        # Number of steps to start to train generator.
+discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
+train_max_steps: 2500000 # Number of training steps.
+save_interval_steps: 5000 # Interval steps to save checkpoint.
+eval_interval_steps: 1000 # Interval steps to evaluate the network.
+
+###########################################################
+# OTHER SETTING #
+###########################################################
+num_snapshots: 10 # max number of snapshots to keep while training
+seed: 42 # random seed for paddle, random, and np.random
diff --git a/examples/csmsc/voc5/iSTFTNet.md b/examples/csmsc/voc5/iSTFTNet.md
new file mode 100644
index 00000000..8f121938
--- /dev/null
+++ b/examples/csmsc/voc5/iSTFTNet.md
@@ -0,0 +1,145 @@
+# iSTFTNet with CSMSC
+
+This example contains code used to train an [iSTFTNet](https://arxiv.org/abs/2203.02395) model with the [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
+
+## Dataset
+### Download and Extract
+Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
+
+### Get MFA Result and Extract
+We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio.
+You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
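+
+A minimal sketch of fetching and unpacking the alignment result with the link above (the unpack location is an assumption; the Get Started section below expects it at `./baker_alignment_tone`):
+```bash
+# download the pre-computed MFA alignment for CSMSC and unpack it in the example directory
+wget https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz
+tar -zxvf baker_alignment_tone.tar.gz
+```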
+
+## Get Started
+Assume the path to the dataset is `~/datasets/BZNSYP`.
+Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+ - synthesize waveform from `metadata.jsonl`.
+```bash
+./run.sh
+```
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
+```bash
+./local/preprocess.sh ${conf_path}
+```
+When it is done, a `dump` folder is created in the current directory. The structure of the `dump` folder is listed below.
+
+```text
+dump
+├── dev
+│ ├── norm
+│ └── raw
+├── test
+│ ├── norm
+│ └── raw
+└── train
+ ├── norm
+ ├── raw
+ └── feats_stats.npy
+```
+The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the `norm` folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set and stored in `dump/train/feats_stats.npy`.
+
+Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains the id of each utterance and the path to its spectrogram.
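+
+For a quick sanity check of the preprocessing output, you can peek at the metadata directly (paths follow the layout above):
+```bash
+# show one record and count the number of training utterances
+head -n 1 dump/train/norm/metadata.jsonl
+wc -l dump/train/norm/metadata.jsonl
+```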
+
+### Model Training
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
+```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+Here's the complete help message.
+
+```text
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+ [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+ [--ngpu NGPU]
+
+Train a HiFiGAN model.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config CONFIG HiFiGAN config file.
+ --train-metadata TRAIN_METADATA
+ training data.
+ --dev-metadata DEV_METADATA
+ dev data.
+ --output-dir OUTPUT_DIR
+ output dir.
+ --ngpu NGPU if ngpu == 0, use cpu.
+```
+
+1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/iSTFT.yaml`.
+2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
+3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
+4. `--ngpu` is the number of gpus to use; if ngpu == 0, use cpu.
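+
+Putting these flags together, a direct invocation might look like the following sketch; the output directory is illustrative and the metadata paths assume the default `dump` layout produced by the preprocessing stage:
+```bash
+python3 ${BIN_DIR}/train.py \
+    --config=conf/iSTFT.yaml \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --output-dir=exp/default \
+    --ngpu=1
+```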
+
+### Synthesizing
+`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
+ [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
+ [--output-dir OUTPUT_DIR] [--ngpu NGPU]
+
+Synthesize with GANVocoder.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --generator-type GENERATOR_TYPE
+ type of GANVocoder, should in {pwgan, mb_melgan,
+ style_melgan, } now
+ --config CONFIG GANVocoder config file.
+ --checkpoint CHECKPOINT
+ snapshot to load.
+ --test-metadata TEST_METADATA
+ dev data.
+ --output-dir OUTPUT_DIR
+ output dir.
+ --ngpu NGPU if ngpu == 0, use cpu.
+```
+
+1. `--config` config file. You should use the same config with which the model is trained.
+2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
+3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
+4. `--output-dir` is the directory to save the synthesized audio files.
+5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
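+
+For reference, a direct call to the synthesis script might look like the sketch below. The checkpoint name and output directory are illustrative; `--generator-type` is left at its default here, since the accepted values depend on the GANVocoder build you are using.
+```bash
+# a sketch of calling synthesize.py directly with the options described above
+python3 ${BIN_DIR}/../synthesize.py \
+    --config=conf/iSTFT.yaml \
+    --checkpoint=exp/default/checkpoints/snapshot_iter_50000.pdz \
+    --test-metadata=dump/dev/norm/metadata.jsonl \
+    --output-dir=exp/default/test \
+    --ngpu=1
+```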
+
+## Pretrained Models
+
+The pretrained model can be downloaded here:
+
+- [iSTFTNet_csmsc_ckpt.zip](https://pan.baidu.com/s/1SNDlRWOGOcbbrKf5w-TJaA?pwd=r1e5)
+
+The iSTFTNet checkpoint contains the files listed below.
+
+```text
+iSTFTNet_csmsc_ckpt
+├── iSTFT.yaml # config used to train iSTFTNet
+├── feats_stats.npy             # statistics used to normalize the spectrogram when training iSTFTNet
+└── snapshot_iter_50000.pdz     # generator parameters of iSTFTNet
+```
+
+A comparison between iSTFTNet and HiFiGAN:
+| Model | Step | eval/generator_loss | eval/mel_loss | eval/feature_matching_loss | rtf |
+|:--------:|:--------------:|:-------------------:|:-------------:|:--------------------------:| :---: |
+| HiFiGAN | 1(gpu) x 50000 | 13.989 | 0.14683 | 1.3484 | 0.01767 |
+| iSTFTNet | 1(gpu) x 50000 | 13.319 | 0.14818 | 1.1069 | 0.01069 |
+
+> RTF is measured on the CSMSC test dataset; the test environment is AI Studio V100 16G with 1 GPU, and the test command is `./run.sh --stage 2 --stop-stage 2`.
+
+The pretrained HiFiGAN model in the comparison can be downloaded here:
+
+- [hifigan_csmsc_ckpt.zip](https://pan.baidu.com/s/1pGY6RYV7yEB_5hRI_JoWig?pwd=tcaj)
+
+## Acknowledgement
+
+We adapted some code from https://github.com/rishikksh20/iSTFTNet-pytorch.git.
diff --git a/examples/librispeech/asr2/README.md b/examples/librispeech/asr2/README.md
index 26978520..253c9b45 100644
--- a/examples/librispeech/asr2/README.md
+++ b/examples/librispeech/asr2/README.md
@@ -153,7 +153,7 @@ After training the model, we need to get the final model for testing and inferen
```bash
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
- avg.sh lastest exp/${ckpt}/checkpoints ${avg_num}
+ avg.sh latest exp/${ckpt}/checkpoints ${avg_num}
fi
```
The `avg.sh` is in the `../../../utils/` which is define in the `path.sh`.
diff --git a/examples/librispeech/asr3/path.sh b/examples/librispeech/asr3/path.sh
index f4717838..d98171a8 100644
--- a/examples/librispeech/asr3/path.sh
+++ b/examples/librispeech/asr3/path.sh
@@ -10,6 +10,4 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
-
-MODEL=wav2vec2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/wav2vec2/bin
diff --git a/examples/librispeech/asr3/run.sh b/examples/librispeech/asr3/run.sh
old mode 100644
new mode 100755
index f52266a1..c880c9cb
--- a/examples/librispeech/asr3/run.sh
+++ b/examples/librispeech/asr3/run.sh
@@ -44,4 +44,4 @@ fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# test a single .wav file
CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
-fi
+fi
\ No newline at end of file
diff --git a/examples/librispeech/asr4/README.md b/examples/librispeech/asr4/README.md
new file mode 100644
index 00000000..064a7f16
--- /dev/null
+++ b/examples/librispeech/asr4/README.md
@@ -0,0 +1,197 @@
+# Hubert2ASR with Librispeech
+This example contains code used to fine-tune the [HuBERT](https://arxiv.org/abs/2106.07447) model with the [LibriSpeech dataset](http://www.openslr.org/resources/12).
+## Overview
+All the scripts you need are in `run.sh`. There are several stages in `run.sh`, and each stage has its function.
+| Stage | Function |
+|:---- |:----------------------------------------------------------- |
+| 0 | Process data. It includes: (1) Download the dataset (2) Calculate the CMVN of the train dataset (3) Get the vocabulary file (4) Get the manifest files of the train, development and test dataset (5) Download the pretrained HuBERT model |
+| 1 | Train the model |
+| 2 | Get the final model by averaging the top-k models; setting k = 1 means choosing the best model |
+| 3 | Test the final model performance |
+| 4 | Infer the single audio file |
+
+
+You can choose to run a range of stages by setting `stage` and `stop_stage`.
+
+For example, if you want to execute the code in stage 2 and stage 3, you can run this script:
+```bash
+bash run.sh --stage 2 --stop_stage 3
+```
+Or you can set `stage` equal to `stop_stage` to run only one stage.
+For example, if you only want to run `stage 0`, you can use the script below:
+```bash
+bash run.sh --stage 0 --stop_stage 0
+```
+The document below will describe the scripts in `run.sh` in detail.
+## The Environment Variables
+`path.sh` contains the environment variables.
+```bash
+. ./path.sh
+. ./cmd.sh
+```
+This script needs to be run first. Another script is also needed:
+```bash
+source ${MAIN_ROOT}/utils/parse_options.sh
+```
+It enables passing options in the form `--variable value` to the shell scripts.
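+
+As a minimal sketch of what this enables: any variable that is defined before `parse_options.sh` is sourced can be overridden from the command line. The variable name below is only an example.
+```bash
+# inside a script: define a default, then source parse_options.sh
+avg_num=1
+source ${MAIN_ROOT}/utils/parse_options.sh
+# now running `bash the_script.sh --avg_num 5` sets avg_num=5 before the rest of the script runs
+```
+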
+## The Local Variables
+Some local variables are set in `run.sh`.
+`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU.
+`stage` denotes the number of stages you want to start from in the experiments.
+`stop_stage` denotes the number of the stage you want to end at in the experiments.
+`conf_path` denotes the config path of the model.
+`avg_num` denotes the number K of top-K models you want to average to get the final model.
+`audio_file` denotes the file path of the single audio file you want to infer in stage 4.
+`ckpt` denotes the checkpoint prefix of the model, e.g. "hubertASR"
+
+You can set the local variables (except `ckpt`) when you use `run.sh`.
+
+For example, you can set the `gpus` and `avg_num` when you use the command line:
+```bash
+bash run.sh --gpus 0,1 --avg_num 20
+```
+## Stage 0: Data Processing
+To use this example, you need to process the data first, and you can use stage 0 in `run.sh` to do this. The code is shown below:
+```bash
+ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ bash ./local/data.sh || exit -1
+ fi
+```
+Stage 0 is for processing the data.
+
+If you only want to process the data, you can run:
+```bash
+bash run.sh --stage 0 --stop_stage 0
+```
+You can also just run these scripts in your command line.
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+```
+After processing the data, the `data` directory will look like this:
+```bash
+data/
+|-- dev.meta
+|-- lang_char
+| `-- bpe_unigram_5000.model
+| `-- bpe_unigram_5000.vocab
+| `-- vocab.txt
+|-- manifest.dev
+|-- manifest.dev.raw
+|-- manifest.test
+|-- manifest.test.raw
+|-- manifest.train
+|-- manifest.train.raw
+|-- mean_std.json
+|-- test.meta
+`-- train.meta
+```
+
+Stage 0 also downloads the pre-trained [hubert](https://paddlespeech.bj.bcebos.com/hubert/hubert-large-lv60.pdparams) model.
+```bash
+mkdir -p exp/hubert
+wget -P exp/hubert https://paddlespeech.bj.bcebos.com/hubert/hubert-large-lv60.pdparams
+```
+## Stage 1: Model Training
+If you want to train the model, you can use stage 1 in `run.sh`. The code is shown below:
+```bash
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # train model, all `ckpt` under `exp` dir
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+ fi
+```
+If you want to train the model, you can use the script below to execute stage 0 and stage 1:
+```bash
+bash run.sh --stage 0 --stop_stage 1
+```
+or you can run these scripts in the command line (only use CPU).
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/hubertASR.yaml hubertASR
+```
+## Stage 2: Top-k Models Averaging
+After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below. Note: We only train one epoch for hubertASR, thus the `avg_num` is set to 1.
+```bash
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # avg n best model
+ avg.sh best exp/${ckpt}/checkpoints ${avg_num}
+ fi
+```
+The `avg.sh` script is in `../../../utils/`, which is defined in `path.sh`.
+If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2:
+```bash
+bash run.sh --stage 0 --stop_stage 2
+```
+or you can run these scripts in the command line (only use CPU).
+
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/hubertASR.yaml hubertASR
+avg.sh best exp/hubertASR/checkpoints 1
+```
+## Stage 3: Model Testing
+The test stage is to evaluate the model performance. The code of test stage is shown below:
+```bash
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # test ckpt avg_n
+ CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+ fi
+```
+If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3 :
+```bash
+bash run.sh --stage 0 --stop_stage 3
+```
+or you can run these scripts in the command line (only use CPU).
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/hubertASR.yaml hubertASR
+avg.sh best exp/hubertASR/checkpoints 1
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/hubertASR.yaml conf/tuning/decode.yaml exp/hubertASR/checkpoints/avg_1
+```
+## Pretrained Model
+You can get the pretrained hubertASR model from [released_model.md](../../../docs/source/released_model.md).
+
+Use the `tar` command to unpack the model, and then you can use the scripts below to test it.
+
+For example:
+```bash
+wget https://paddlespeech.bj.bcebos.com/hubert/hubertASR-large-100h-librispeech_ckpt_1.4.0.model.tar.gz
+tar xzvf hubertASR-large-100h-librispeech_ckpt_1.4.0.model.tar.gz
+source path.sh
+# If you have processed the data and obtained the manifest files, you can skip the following 2 steps
+bash local/data.sh --stage -1 --stop_stage -1
+bash local/data.sh --stage 2 --stop_stage 2
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/hubertASR.yaml conf/tuning/decode.yaml exp/hubertASR/checkpoints/avg_1
+```
+The performance of the released models is shown [here](./RESULTS.md).
+
+
+## Stage 4: Single Audio File Inference
+In some situations, you may want to use the trained model to run inference on a single audio file. You can use stage 4 for this. The code is shown below:
+```bash
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # test a single .wav file
+ CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+ fi
+```
+You can train the model yourself using `bash run.sh --stage 0 --stop_stage 3`, or you can download the pretrained model with the script below:
+```bash
+wget https://paddlespeech.bj.bcebos.com/hubert/hubertASR-large-100h-librispeech_ckpt_1.4.0.model.tar.gz
+tar xzvf hubertASR-large-100h-librispeech_ckpt_1.4.0.model.tar.gz
+```
+You can download the audio demo:
+```bash
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
+```
+You need to prepare an audio file or use the audio demo above; please confirm that the sample rate of the audio is 16 kHz. You can get the result of the audio demo by running the script below.
+```bash
+CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/hubertASR.yaml conf/tuning/decode.yaml exp/hubertASR/checkpoints/avg_1 data/demo_002_en.wav
+```
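+
+If you are unsure about the sample rate of your own audio, you can check it first. This is only a sketch; it assumes the `soundfile` Python package is available in your environment (it is commonly installed alongside PaddleSpeech).
+```bash
+# print the sample rate of a wav file; it should be 16000 for this example
+python3 -c "import soundfile; print(soundfile.info('data/demo_002_en.wav').samplerate)"
+```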
diff --git a/examples/librispeech/asr4/RESULTS.md b/examples/librispeech/asr4/RESULTS.md
new file mode 100644
index 00000000..81ce6ee9
--- /dev/null
+++ b/examples/librispeech/asr4/RESULTS.md
@@ -0,0 +1,9 @@
+# LibriSpeech
+
+## hubertASR
+Fine-tuning on train-clean-100
+train: Epoch 3, 1*V100-32G, batchsize: 4, accum_grad: 8
+
+| Model | Params | Config | Augmentation| Test set | Decode method | WER |
+| --- | --- | --- | --- | --- | --- | --- |
+| hubertASR | 326.16M | conf/hubertASR.yaml | spec_aug | test-clean | greedy search | 0.05868 |
diff --git a/examples/librispeech/asr4/cmd.sh b/examples/librispeech/asr4/cmd.sh
new file mode 100644
index 00000000..7b70ef5e
--- /dev/null
+++ b/examples/librispeech/asr4/cmd.sh
@@ -0,0 +1,89 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: .pl [options] JOB=1:
+# e.g.
+# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+# --time : Limit the maximum time to execute.
+# --mem : Limit the maximum memory usage.
+# --max-jobs-run : Limit the number of parallel jobs. This is ignored for non-array jobs.
+# --num-threads : Specify the number of CPU core.
+# --gpu : Specify the number of GPU devices.
+# --config: Change the configuration file from default.
+#
+# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
+# The left string of "=", i.e. "JOB", is replaced by (Nth job) in the command and the log file name,
+# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
+# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
+#
+# run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
+# These options are mapping to specific options for each backend and
+# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
+# If jobs failed, your configuration might be wrong for your environment.
+#
+#
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
+# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
+# =========================================================~
+
+
+# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
+cmd_backend='local'
+
+# Local machine, without any Job scheduling system
+if [ "${cmd_backend}" = local ]; then
+
+ # The other usage
+ export train_cmd="run.pl"
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+ export cuda_cmd="run.pl"
+ # Used for "*_recog.py"
+ export decode_cmd="run.pl"
+
+# "qsub" (SGE, Torque, PBS, etc.)
+elif [ "${cmd_backend}" = sge ]; then
+ # The default setting is written in conf/queue.conf.
+ # You must change "-q g.q" for the "queue" for your environment.
+ # To know the "queue" names, type "qhost -q"
+ # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
+
+ export train_cmd="queue.pl"
+ export cuda_cmd="queue.pl"
+ export decode_cmd="queue.pl"
+
+# "sbatch" (Slurm)
+elif [ "${cmd_backend}" = slurm ]; then
+ # The default setting is written in conf/slurm.conf.
+    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+    # To know the "partition" names, type "sinfo".
+ # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
+ # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
+
+ export train_cmd="slurm.pl"
+ export cuda_cmd="slurm.pl"
+ export decode_cmd="slurm.pl"
+
+elif [ "${cmd_backend}" = ssh ]; then
+ # You have to create ".queue/machines" to specify the host to execute jobs.
+ # e.g. .queue/machines
+ # host1
+ # host2
+ # host3
+    # Assuming you can log in to them without any password, i.e. you have to set up ssh keys.
+
+ export train_cmd="ssh.pl"
+ export cuda_cmd="ssh.pl"
+ export decode_cmd="ssh.pl"
+
+# This is an example of specifying several unique options in the JHU CLSP cluster setup.
+# Users can modify/add their own command options according to their cluster environments.
+elif [ "${cmd_backend}" = jhu ]; then
+
+ export train_cmd="queue.pl --mem 2G"
+ export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
+ export decode_cmd="queue.pl --mem 4G"
+
+else
+ echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
+ return 1
+fi
diff --git a/examples/librispeech/asr4/conf/config.json b/examples/librispeech/asr4/conf/config.json
new file mode 100644
index 00000000..ff2572ee
--- /dev/null
+++ b/examples/librispeech/asr4/conf/config.json
@@ -0,0 +1,77 @@
+{
+ "_name_or_path": "facebook/hubert-large-ll60k",
+ "activation_dropout": 0.0,
+ "apply_spec_augment": true,
+ "architectures": [
+ "HubertModel"
+ ],
+ "attention_dropout": 0.1,
+ "bos_token_id": 1,
+ "conv_bias": true,
+ "conv_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512
+ ],
+ "conv_kernel": [
+ 10,
+ 3,
+ 3,
+ 3,
+ 3,
+ 2,
+ 2
+ ],
+ "conv_stride": [
+ 5,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2
+ ],
+ "ctc_loss_reduction": "sum",
+ "ctc_zero_infinity": false,
+ "do_stable_layer_norm": true,
+ "eos_token_id": 2,
+ "feat_extract_activation": "gelu",
+ "feat_extract_dropout": 0.0,
+ "feat_extract_norm": "layer",
+ "feat_proj_dropout": 0.1,
+ "final_dropout": 0.0,
+ "gradient_checkpointing": false,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.1,
+ "hidden_size": 1024,
+ "initializer_range": 0.02,
+ "intermediate_size": 4096,
+ "layer_norm_eps": 1e-05,
+ "layerdrop": 0.1,
+ "mask_channel_length": 10,
+ "mask_channel_min_space": 1,
+ "mask_channel_other": 0.0,
+ "mask_channel_prob": 0.0,
+ "mask_channel_selection": "static",
+ "mask_feature_length": 10,
+ "mask_feature_prob": 0.0,
+ "mask_time_length": 10,
+ "mask_time_min_space": 1,
+ "mask_time_other": 0.0,
+ "mask_time_prob": 0.075,
+ "mask_time_selection": "static",
+ "model_type": "hubert",
+ "num_attention_heads": 16,
+ "num_conv_pos_embedding_groups": 16,
+ "num_conv_pos_embeddings": 128,
+ "num_feat_extract_layers": 7,
+ "num_hidden_layers": 24,
+ "pad_token_id": 0,
+ "transformers_version": "4.10.0.dev0",
+ "vocab_size": 32,
+ "tokenizer_class": "Wav2Vec2CTCTokenizer"
+}
diff --git a/examples/librispeech/asr4/conf/hubertASR.yaml b/examples/librispeech/asr4/conf/hubertASR.yaml
new file mode 100644
index 00000000..44c3d3e1
--- /dev/null
+++ b/examples/librispeech/asr4/conf/hubertASR.yaml
@@ -0,0 +1,142 @@
+############################################
+# Network Architecture #
+############################################
+freeze_hubert: False
+normalize_wav: True
+output_norm: True
+init_type: kaiming_uniform # !Warning: needed for convergence
+enc:
+ input_shape: 1024
+ dnn_blocks: 2
+ dnn_neurons: 1024
+ activation: True
+ctc:
+ enc_n_units: 1024
+ blank_id: 0
+ dropout_rate: 0.0
+hubert_params_path: "exp/hubert/hubert-large-lv60.pdparams"
+
+
+task_cfg:
+ label_rate: 50.0
+ sample_rate: 16000
+ normalize: True
+ enable_padding: False
+ max_keep_size: None
+ max_sample_size: 250000
+ min_sample_size: 32000
+ single_target: False
+ random_crop: True
+ pad_audio: False
+
+model_cfg:
+ dropout_input: 0.0
+ final_dropout: 0.0
+ dropout: 0.0
+ attention_dropout: 0.0
+ activation_dropout: 0.1
+ apply_mask: True
+ mask_length: 10
+ mask_prob: 0.5
+ mask_selection: static
+ mask_other: 0.0
+ no_mask_overlap: False
+ mask_channel_length: 64
+ mask_channel_prob: 0.25
+ mask_channel_selection: static
+ mask_channel_other: 0.0
+ no_mask_channel_overlap: False
+ feature_grad_mult: 0.0
+ layerdrop: 0.1
+ normalize: True
+ fp16: True
+ label_rate: 50
+ extractor_mode: layer_norm
+ encoder_layers: 24
+ encoder_embed_dim: 1024
+ encoder_ffn_embed_dim: 4096
+ encoder_attention_heads: 16
+ activation_fn: gelu
+ encoder_layerdrop: 0.1
+ dropout_features: 0.0
+ final_dim: 768
+ untie_final_proj: True
+ layer_norm_first: True
+ conv_feature_layers: "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2"
+ conv_bias: False
+ logit_temp: 0.1
+ target_glu: False
+ mask_min_space: 1
+ mask_channel_min_space: 1
+ conv_pos: 128
+ conv_pos_groups: 16
+ latent_temp: [2.0, 0.5, 0.999995]
+ skip_masked: False
+ skip_nomask: True
+
+###########################################
+# Data #
+###########################################
+train_manifest: data/manifest.train-clean-100
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test-clean
+
+###########################################
+# Dataloader #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: char
+mean_std_filepath: ""
+preprocess_config: conf/preprocess.yaml
+sortagrad: -1 # Feed samples from shortest to longest ; -1: enabled for all epochs 0: disabled other: enabled for other epochs
+batch_size: 4 # Different batch_size may cause large differences in results
+maxlen_in: 1500 # if input length > maxlen-in batchsize is automatically reduced
+maxlen_out: 150 # if output length > maxlen-out batchsize is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+dist_sampler: True
+shortest_first: True
+return_lens_rate: True
+
+############################################
+# Data Augmentation #
+############################################
+audio_augment: # for raw audio
+ sample_rate: 16000
+ speeds: [95, 100, 105]
+
+###########################################
+# Training #
+###########################################
+n_epoch: 3
+accum_grad: 8
+global_grad_clip: 5.0
+model_optim: adadelta
+model_optim_conf:
+ lr: 1.0
+ epsilon: 1.0e-6
+ rho: 0.95
+model_scheduler: constantlr
+model_scheduler_conf:
+ warmup_steps: 25000
+ lr_decay: 1.0
+hubert_optim: adadelta
+hubert_optim_conf:
+ lr: 0.95
+ epsilon: 1.0e-6
+ rho: 0.95
+hubert_scheduler: constantlr
+hubert_scheduler_conf:
+ warmup_steps: 25000
+ lr_decay: 1.0
+log_interval: 1
+checkpoint:
+ kbest_n: 50
+ latest_n: 5
diff --git a/examples/librispeech/asr4/conf/preprocess.yaml b/examples/librispeech/asr4/conf/preprocess.yaml
new file mode 100644
index 00000000..724782ed
--- /dev/null
+++ b/examples/librispeech/asr4/conf/preprocess.yaml
@@ -0,0 +1,3 @@
+process:
+ # use raw audio
+ - type: wav_process
diff --git a/examples/librispeech/asr4/conf/preprocessor_config.json b/examples/librispeech/asr4/conf/preprocessor_config.json
new file mode 100644
index 00000000..36ebe8b7
--- /dev/null
+++ b/examples/librispeech/asr4/conf/preprocessor_config.json
@@ -0,0 +1,9 @@
+{
+ "do_normalize": true,
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+ "feature_size": 1,
+ "padding_side": "right",
+ "padding_value": 0,
+ "return_attention_mask": true,
+ "sampling_rate": 16000
+}
diff --git a/examples/librispeech/asr4/conf/tuning/decode.yaml b/examples/librispeech/asr4/conf/tuning/decode.yaml
new file mode 100644
index 00000000..2ba39326
--- /dev/null
+++ b/examples/librispeech/asr4/conf/tuning/decode.yaml
@@ -0,0 +1,4 @@
+decode_batch_size: 1
+error_rate_type: wer
+decoding_method: ctc_greedy_search # 'ctc_greedy_search', 'ctc_prefix_beam_search'
+beam_size: 10
diff --git a/examples/librispeech/asr4/local/data.sh b/examples/librispeech/asr4/local/data.sh
new file mode 100755
index 00000000..7d0613d5
--- /dev/null
+++ b/examples/librispeech/asr4/local/data.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+
+stage=-1
+stop_stage=100
+
+unit_type=char
+dict_dir=data/lang_char
+
+source ${MAIN_ROOT}/utils/parse_options.sh
+
+mkdir -p data
+mkdir -p ${dict_dir}
+TARGET_DIR=${MAIN_ROOT}/dataset
+mkdir -p ${TARGET_DIR}
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+ # download data, generate manifests
+ python3 ${TARGET_DIR}/librispeech/librispeech.py \
+ --manifest_prefix="data/manifest" \
+ --target_dir="${TARGET_DIR}/librispeech" \
+ --full_download="True"
+
+ if [ $? -ne 0 ]; then
+ echo "Prepare LibriSpeech failed. Terminated."
+ exit 1
+ fi
+
+ for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
+ mv data/manifest.${set} data/manifest.${set}.raw
+ done
+
+ rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
+ for set in train-clean-100 train-clean-360 train-other-500; do
+ cat data/manifest.${set}.raw >> data/manifest.train.raw
+ done
+
+ for set in dev-clean dev-other; do
+ cat data/manifest.${set}.raw >> data/manifest.dev.raw
+ done
+
+ for set in test-clean test-other; do
+ cat data/manifest.${set}.raw >> data/manifest.test.raw
+ done
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # compute mean and stddev for normalizer
+ num_workers=$(nproc)
+ python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
+ --manifest_path="data/manifest.train.raw" \
+ --num_samples=2000 \
+ --spectrum_type="fbank" \
+ --feat_dim=161 \
+ --delta_delta=false \
+ --sample_rate=16000 \
+ --stride_ms=10 \
+ --window_ms=25 \
+ --use_dB_normalization=False \
+ --num_workers=${num_workers} \
+ --output_path="data/mean_std.json"
+
+ if [ $? -ne 0 ]; then
+ echo "Compute mean and stddev failed. Terminated."
+ exit 1
+ fi
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # build vocabulary
+ python3 ${MAIN_ROOT}/utils/build_vocab.py \
+ --unit_type ${unit_type} \
+ --count_threshold=0 \
+ --vocab_path="${dict_dir}/vocab.txt" \
+ --manifest_paths="data/manifest.train.raw"
+
+ if [ $? -ne 0 ]; then
+ echo "Build vocabulary failed. Terminated."
+ exit 1
+ fi
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # format manifest with tokenids, vocab size
+ for set in train dev test dev-clean dev-other test-clean test-other; do
+ {
+ python3 ${MAIN_ROOT}/utils/format_data.py \
+ --cmvn_path "data/mean_std.json" \
+ --unit_type ${unit_type} \
+ --vocab_path="${dict_dir}/vocab.txt" \
+ --manifest_path="data/manifest.${set}.raw" \
+ --output_path="data/manifest.${set}"
+
+ if [ $? -ne 0 ]; then
+            echo "Format manifest.${set} failed. Terminated."
+ exit 1
+ fi
+ }&
+ done
+ wait
+fi
+
+echo "LibriSpeech Data preparation done."
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ mkdir -p exp/hubert
+ echo "Pretrained hubert model download"
+ wget -P exp/hubert https://paddlespeech.bj.bcebos.com/hubert/hubert-large-lv60.pdparams
+fi
+
+exit 0
\ No newline at end of file
diff --git a/examples/librispeech/asr4/local/test.sh b/examples/librispeech/asr4/local/test.sh
new file mode 100755
index 00000000..dfbd56ac
--- /dev/null
+++ b/examples/librispeech/asr4/local/test.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+
+set -e
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+expdir=exp
+datadir=data
+
+recog_set="test-clean test-other dev-clean dev-other"
+recog_set="test-clean"
+
+config_path=$1
+decode_config_path=$2
+ckpt_prefix=$3
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+# download language model
+#bash local/download_lm_en.sh
+#if [ $? -ne 0 ]; then
+# exit 1
+#fi
+
+python3 utils/format_rsl.py \
+ --origin_ref data/manifest.test-clean.raw \
+ --trans_ref data/manifest.test-clean.text
+
+
+for type in ctc_greedy_search; do
+ echo "decoding ${type}"
+ batch_size=16
+ python3 -u ${BIN_DIR}/test.py \
+ --ngpu ${ngpu} \
+ --config ${config_path} \
+ --decode_cfg ${decode_config_path} \
+ --result_file ${ckpt_prefix}.${type}.rsl \
+ --checkpoint_path ${ckpt_prefix} \
+ --opts decode.decoding_method ${type} \
+ --opts decode.decode_batch_size ${batch_size}
+
+ if [ $? -ne 0 ]; then
+ echo "Failed in evaluation!"
+ exit 1
+ fi
+ python3 utils/format_rsl.py \
+ --origin_hyp ${ckpt_prefix}.${type}.rsl \
+ --trans_hyp ${ckpt_prefix}.${type}.rsl.text
+
+ python3 utils/compute-wer.py --char=1 --v=1 \
+ data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
+ echo "decoding ${type} done."
+done
+
+for type in ctc_prefix_beam_search; do
+ echo "decoding ${type}"
+ batch_size=1
+ python3 -u ${BIN_DIR}/test.py \
+ --ngpu ${ngpu} \
+ --config ${config_path} \
+ --decode_cfg ${decode_config_path} \
+ --result_file ${ckpt_prefix}.${type}.rsl \
+ --checkpoint_path ${ckpt_prefix} \
+ --opts decode.decoding_method ${type} \
+ --opts decode.decode_batch_size ${batch_size}
+
+ if [ $? -ne 0 ]; then
+ echo "Failed in evaluation!"
+ exit 1
+ fi
+ python3 utils/format_rsl.py \
+ --origin_hyp ${ckpt_prefix}.${type}.rsl \
+ --trans_hyp ${ckpt_prefix}.${type}.rsl.text
+
+ python3 utils/compute-wer.py --char=1 --v=1 \
+ data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
+ echo "decoding ${type} done."
+done
+
+echo "Finished"
+
+exit 0
diff --git a/examples/librispeech/asr4/local/test_wav.sh b/examples/librispeech/asr4/local/test_wav.sh
new file mode 100755
index 00000000..fdf3589f
--- /dev/null
+++ b/examples/librispeech/asr4/local/test_wav.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+if [ $# != 4 ];then
+ echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
+ exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+decode_config_path=$2
+ckpt_prefix=$3
+audio_file=$4
+
+mkdir -p data
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
+if [ $? -ne 0 ]; then
+ exit 1
+fi
+
+if [ ! -f ${audio_file} ]; then
+    echo "Please input the right audio_file path"
+ exit 1
+fi
+
+chunk_mode=false
+if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
+ chunk_mode=true
+fi
+
+# download language model
+#bash local/download_lm_ch.sh
+#if [ $? -ne 0 ]; then
+# exit 1
+#fi
+
+for type in ctc_greedy_search; do
+ echo "decoding ${type}"
+ batch_size=1
+ output_dir=${ckpt_prefix}
+ mkdir -p ${output_dir}
+ python3 -u ${BIN_DIR}/test_wav.py \
+ --ngpu ${ngpu} \
+ --config ${config_path} \
+ --decode_cfg ${decode_config_path} \
+ --result_file ${output_dir}/${type}.rsl \
+ --checkpoint_path ${ckpt_prefix} \
+ --opts decode.decoding_method ${type} \
+ --opts decode.decode_batch_size ${batch_size} \
+ --audio_file ${audio_file}
+
+ if [ $? -ne 0 ]; then
+ echo "Failed in evaluation!"
+ exit 1
+ fi
+done
+exit 0
diff --git a/examples/librispeech/asr4/local/train.sh b/examples/librispeech/asr4/local/train.sh
new file mode 100755
index 00000000..24776fd1
--- /dev/null
+++ b/examples/librispeech/asr4/local/train.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+if [ $# -lt 2 ] || [ $# -gt 4 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name resume(optional) ips(optional)"
+ exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_name=$2
+resume=$3
+ips=$4
+
+if [ ! $ips ];then
+ ips_config=
+else
+ ips_config="--ips="${ips}
+fi
+
+mkdir -p exp
+
+# seed may break model convergence
+seed=1988
+if [ ${seed} != 0 ]; then
+ export FLAGS_cudnn_deterministic=True
+fi
+
+# export FLAGS_cudnn_exhaustive_search=true
+# export FLAGS_conv_workspace_size_limit=4000
+export FLAGS_allocator_strategy=naive_best_fit
+if [ ${ngpu} == 0 ]; then
+python3 -u ${BIN_DIR}/train.py \
+--ngpu ${ngpu} \
+--config ${config_path} \
+--output exp/${ckpt_name} \
+--seed ${seed} \
+--resume ${resume}
+else
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
+--ngpu ${ngpu} \
+--config ${config_path} \
+--output exp/${ckpt_name} \
+--seed ${seed} \
+--resume ${resume}
+fi
+
+if [ ${seed} != 0 ]; then
+ unset FLAGS_cudnn_deterministic
+fi
+
+if [ $? -ne 0 ]; then
+ echo "Failed in training!"
+ exit 1
+fi
+
+exit 0
diff --git a/examples/librispeech/asr4/path.sh b/examples/librispeech/asr4/path.sh
new file mode 100644
index 00000000..254216a6
--- /dev/null
+++ b/examples/librispeech/asr4/path.sh
@@ -0,0 +1,13 @@
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
+
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/hubert/bin
diff --git a/examples/librispeech/asr4/run.sh b/examples/librispeech/asr4/run.sh
new file mode 100755
index 00000000..6d7dc6c9
--- /dev/null
+++ b/examples/librispeech/asr4/run.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+set -e
+
+. ./path.sh || exit 1;
+. ./cmd.sh || exit 1;
+
+gpus=0
+stage=0
+stop_stage=0
+conf_path=conf/hubertASR.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
+decode_conf_path=conf/tuning/decode.yaml
+avg_num=1
+resume= # xx e.g. 30
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+audio_file=data/demo_002_en.wav
+
+avg_ckpt=avg_${avg_num}
+ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
+echo "checkpoint name ${ckpt}"
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ bash ./local/data.sh || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # train model, all `ckpt` under `exp` dir
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${resume} ${ips}
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # avg n best model
+ avg.sh best exp/${ckpt}/checkpoints ${avg_num}
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # greedy search decoder
+ CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # test a single .wav file
+ CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+fi
diff --git a/examples/librispeech/asr4/utils b/examples/librispeech/asr4/utils
new file mode 120000
index 00000000..973afe67
--- /dev/null
+++ b/examples/librispeech/asr4/utils
@@ -0,0 +1 @@
+../../../utils
\ No newline at end of file
diff --git a/examples/librispeech/asr5/README.md b/examples/librispeech/asr5/README.md
new file mode 100644
index 00000000..826c33ce
--- /dev/null
+++ b/examples/librispeech/asr5/README.md
@@ -0,0 +1,197 @@
+# WavLM2ASR with Librispeech
+This example contains code used to fine-tune the [WavLM](https://arxiv.org/abs/2110.13900) model with the [LibriSpeech dataset](http://www.openslr.org/resources/12).
+## Overview
+All the scripts you need are in `run.sh`. There are several stages in `run.sh`, and each stage has its function.
+| Stage | Function |
+|:---- |:----------------------------------------------------------- |
+| 0 | Process data. It includes: (1) Download the dataset (2) Calculate the CMVN of the train dataset (3) Get the vocabulary file (4) Get the manifest files of the train, development and test dataset (5) Download the pretrained WavLM model |
+| 1 | Train the model |
+| 2 | Get the final model by averaging the top-k models; setting k = 1 means choosing the best model |
+| 3 | Test the final model performance |
+| 4 | Infer the single audio file |
+
+
+You can choose to run a range of stages by setting `stage` and `stop_stage`.
+
+For example, if you want to execute the code in stage 2 and stage 3, you can run this script:
+```bash
+bash run.sh --stage 2 --stop_stage 3
+```
+Or you can set `stage` equal to `stop_stage` to run only one stage.
+For example, if you only want to run `stage 0`, you can use the script below:
+```bash
+bash run.sh --stage 0 --stop_stage 0
+```
+The document below will describe the scripts in `run.sh` in detail.
+## The Environment Variables
+`path.sh` contains the environment variables.
+```bash
+. ./path.sh
+. ./cmd.sh
+```
+This script needs to be run first. Another script is also needed:
+```bash
+source ${MAIN_ROOT}/utils/parse_options.sh
+```
+It enables passing options in the form `--variable value` to the shell scripts.
+## The Local Variables
+Some local variables are set in `run.sh`.
+`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU.
+`stage` denotes the number of stages you want to start from in the experiments.
+`stop_stage` denotes the number of the stage you want to end at in the experiments.
+`conf_path` denotes the config path of the model.
+`avg_num` denotes the number K of top-K models you want to average to get the final model.
+`audio_file` denotes the file path of the single audio file you want to infer in stage 4.
+`ckpt` denotes the checkpoint prefix of the model, e.g. "WavLMASR"
+
+You can set the local variables (except `ckpt`) when you use `run.sh`.
+
+For example, you can set the `gpus` and `avg_num` when you use the command line:
+```bash
+bash run.sh --gpus 0,1 --avg_num 20
+```
+## Stage 0: Data Processing
+To use this example, you need to process the data first, and you can use stage 0 in `run.sh` to do this. The code is shown below:
+```bash
+ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ bash ./local/data.sh || exit -1
+ fi
+```
+Stage 0 is for processing the data.
+
+If you only want to process the data, you can run:
+```bash
+bash run.sh --stage 0 --stop_stage 0
+```
+You can also just run these scripts in your command line.
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+```
+After processing the data, the `data` directory will look like this:
+```bash
+data/
+|-- dev.meta
+|-- lang_char
+| `-- bpe_unigram_5000.model
+| `-- bpe_unigram_5000.vocab
+| `-- vocab.txt
+|-- manifest.dev
+|-- manifest.dev.raw
+|-- manifest.test
+|-- manifest.test.raw
+|-- manifest.train
+|-- manifest.train.raw
+|-- mean_std.json
+|-- test.meta
+`-- train.meta
+```
+
+Stage 0 also downloads the pre-trained [wavlm](https://paddlespeech.bj.bcebos.com/wavlm/wavlm-base-plus.pdparams) model.
+```bash
+mkdir -p exp/wavlm
+wget -P exp/wavlm https://paddlespeech.bj.bcebos.com/wavlm/wavlm-base-plus.pdparams
+```
+## Stage 1: Model Training
+If you want to train the model, you can use stage 1 in `run.sh`. The code is shown below:
+```bash
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # train model, all `ckpt` under `exp` dir
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+ fi
+```
+If you want to train the model, you can use the script below to execute stage 0 and stage 1:
+```bash
+bash run.sh --stage 0 --stop_stage 1
+```
+or you can run these scripts in the command line (only use CPU).
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wavlmASR.yaml wavlmASR
+```
+## Stage 2: Top-k Models Averaging
+After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below. Note: We only train one epoch for wavlmASR, thus the `avg_num` is set to 1.
+```bash
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # avg n best model
+ avg.sh best exp/${ckpt}/checkpoints ${avg_num}
+ fi
+```
+The `avg.sh` script is in `../../../utils/`, which is defined in `path.sh`.
+If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2:
+```bash
+bash run.sh --stage 0 --stop_stage 2
+```
+or you can run these scripts in the command line (only use CPU).
+
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wavlmASR.yaml wavlmASR
+avg.sh best exp/wavlmASR/checkpoints 1
+```
+## Stage 3: Model Testing
+The test stage is to evaluate the model performance. The code of test stage is shown below:
+```bash
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # test ckpt avg_n
+ CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+ fi
+```
+If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3 :
+```bash
+bash run.sh --stage 0 --stop_stage 3
+```
+or you can run these scripts in the command line (only use CPU).
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wavlmASR.yaml wavlmASR
+avg.sh best exp/wavlmASR/checkpoints 1
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/wavlmASR.yaml conf/tuning/decode.yaml exp/wavlmASR/checkpoints/avg_1
+```
+## Pretrained Model
+You can get the pretrained wavlmASR model from [released_model.md](../../../docs/source/released_model.md).
+
+Use the `tar` command to unpack the model, and then you can use the scripts below to test it.
+
+For example:
+```bash
+wget https://paddlespeech.bj.bcebos.com/wavlm/wavlmASR-base-100h-librispeech_ckpt_1.4.0.model.tar.gz
+tar xzvf wavlmASR-base-100h-librispeech_ckpt_1.4.0.model.tar.gz
+source path.sh
+# If you have processed the data and obtained the manifest files, you can skip the following 2 steps
+bash local/data.sh --stage -1 --stop_stage -1
+bash local/data.sh --stage 2 --stop_stage 2
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/wavlmASR.yaml conf/tuning/decode.yaml exp/wavlmASR/checkpoints/avg_1
+```
+The performance of the released models is shown [here](./RESULTS.md).
+
+
+## Stage 4: Single Audio File Inference
+In some situations, you may want to use the trained model to run inference on a single audio file. You can use stage 4 for this. The code is shown below:
+```bash
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # test a single .wav file
+ CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+ fi
+```
+You can train the model yourself using `bash run.sh --stage 0 --stop_stage 3`, or you can download the pretrained model with the script below:
+```bash
+wget https://paddlespeech.bj.bcebos.com/wavlm/wavlm_baseplus_libriclean_100h.tar.gz
+tar xzvf wavlm_baseplus_libriclean_100h.tar.gz
+```
+You can download the audio demo:
+```bash
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
+```
+You need to prepare an audio file or use the audio demo above; please confirm that the sample rate of the audio is 16 kHz. You can get the result of the audio demo by running the script below.
+```bash
+CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wavlmASR.yaml conf/tuning/decode.yaml exp/wavlmASR/checkpoints/avg_1 data/demo_002_en.wav
+```
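+
+If your own audio is not 16 kHz, you can resample it before inference. This is only a sketch; it assumes `sox` is installed, and the input/output paths are illustrative.
+```bash
+# resample an arbitrary wav file to 16 kHz mono 16-bit for this example
+sox input.wav -r 16000 -c 1 -b 16 data/input_16k.wav
+```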
diff --git a/examples/librispeech/asr5/RESULTS.md b/examples/librispeech/asr5/RESULTS.md
new file mode 100644
index 00000000..806b39a1
--- /dev/null
+++ b/examples/librispeech/asr5/RESULTS.md
@@ -0,0 +1,9 @@
+# LibriSpeech
+
+## WavLMASR
+Fine-tuning on train-clean-100
+train: Epoch 16, 4*A800-80G, batchsize: 16, accum_grad: 8
+
+| Model | Params | Config | Augmentation| Test set | Decode method | WER |
+| --- | --- | --- | --- | --- | --- | --- |
+| WavLMASR | 326.16M | conf/wavlmasr.yaml | spec_aug | test-clean | greedy search | 0.0561 |
diff --git a/examples/librispeech/asr5/avg.sh b/examples/librispeech/asr5/avg.sh
new file mode 100644
index 00000000..c49b5c25
--- /dev/null
+++ b/examples/librispeech/asr5/avg.sh
@@ -0,0 +1,33 @@
+#! /usr/bin/env bash
+
+if [ $# != 3 ]; then
+ echo "usage: ${0} [best|latest] ckpt_dir avg_num"
+ exit -1
+fi
+
+avg_mode=${1} # best,latest
+ckpt_dir=${2}
+average_num=${3}
+decode_checkpoint=${ckpt_dir}/avg_${average_num}.pdparams
+
+if [ $avg_mode == best ];then
+ # best
+ python avg_model.py \
+ --dst_model ${decode_checkpoint} \
+ --ckpt_dir ${ckpt_dir} \
+ --num ${average_num} \
+ --val_best
+else
+ # latest
+ python avg_model.py \
+ --dst_model ${decode_checkpoint} \
+ --ckpt_dir ${ckpt_dir} \
+ --num ${average_num}
+fi
+
+if [ $? -ne 0 ]; then
+ echo "Failed in avg ckpt!"
+ exit 1
+fi
+
+exit 0
diff --git a/examples/librispeech/asr5/cmd.sh b/examples/librispeech/asr5/cmd.sh
new file mode 100644
index 00000000..7b70ef5e
--- /dev/null
+++ b/examples/librispeech/asr5/cmd.sh
@@ -0,0 +1,89 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: .pl [options] JOB=1:
+# e.g.
+# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+# --time : Limit the maximum time to execute.
+# --mem : Limit the maximum memory usage.
+# --max-jobs-run : Limit the number of parallel jobs. This is ignored for non-array jobs.
+# --num-threads : Specify the number of CPU core.
+# --gpu : Specify the number of GPU devices.
+# --config: Change the configuration file from default.
+#
+# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
+# The left string of "=", i.e. "JOB", is replaced by (Nth job) in the command and the log file name,
+# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
+# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
+#
+# run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
+# These options are mapping to specific options for each backend and
+# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
+# If jobs failed, your configuration might be wrong for your environment.
+#
+#
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
+# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
+# =========================================================~
+
+
+# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
+cmd_backend='local'
+
+# Local machine, without any Job scheduling system
+if [ "${cmd_backend}" = local ]; then
+
+ # The other usage
+ export train_cmd="run.pl"
+ # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+ export cuda_cmd="run.pl"
+ # Used for "*_recog.py"
+ export decode_cmd="run.pl"
+
+# "qsub" (SGE, Torque, PBS, etc.)
+elif [ "${cmd_backend}" = sge ]; then
+ # The default setting is written in conf/queue.conf.
+ # You must change "-q g.q" for the "queue" for your environment.
+ # To know the "queue" names, type "qhost -q"
+ # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
+
+ export train_cmd="queue.pl"
+ export cuda_cmd="queue.pl"
+ export decode_cmd="queue.pl"
+
+# "sbatch" (Slurm)
+elif [ "${cmd_backend}" = slurm ]; then
+ # The default setting is written in conf/slurm.conf.
+    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+    # To know the "partition" names, type "sinfo".
+ # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
+ # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
+
+ export train_cmd="slurm.pl"
+ export cuda_cmd="slurm.pl"
+ export decode_cmd="slurm.pl"
+
+elif [ "${cmd_backend}" = ssh ]; then
+ # You have to create ".queue/machines" to specify the host to execute jobs.
+ # e.g. .queue/machines
+ # host1
+ # host2
+ # host3
+    # Assuming you can log in to them without any password, i.e. you have to set up ssh keys.
+
+ export train_cmd="ssh.pl"
+ export cuda_cmd="ssh.pl"
+ export decode_cmd="ssh.pl"
+
+# This is an example of specifying several unique options in the JHU CLSP cluster setup.
+# Users can modify/add their own command options according to their cluster environments.
+elif [ "${cmd_backend}" = jhu ]; then
+
+ export train_cmd="queue.pl --mem 2G"
+ export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
+ export decode_cmd="queue.pl --mem 4G"
+
+else
+ echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
+ return 1
+fi
diff --git a/examples/librispeech/asr5/compute_wer.py b/examples/librispeech/asr5/compute_wer.py
new file mode 100644
index 00000000..5711c725
--- /dev/null
+++ b/examples/librispeech/asr5/compute_wer.py
@@ -0,0 +1,558 @@
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
+# flake8: noqa
+import codecs
+import re
+import sys
+import unicodedata
+
+remove_tag = True
+spacelist = [' ', '\t', '\r', '\n']
+puncts = [
+ '!', ',', '?', '、', '。', '!', ',', ';', '?', ':', '「', '」', '︰', '『', '』',
+ '《', '》'
+]
+
+
+def characterize(string):
+ res = []
+ i = 0
+ while i < len(string):
+ char = string[i]
+ if char in puncts:
+ i += 1
+ continue
+ cat1 = unicodedata.category(char)
+ #https://unicodebook.readthedocs.io/unicode.html#unicode-categories
+ if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned
+ i += 1
+ continue
+ if cat1 == 'Lo': # letter-other
+ res.append(char)
+ i += 1
+ else:
+ # some input looks like: , we want to separate it to two words.
+ sep = ' '
+ if char == '<': sep = '>'
+ j = i + 1
+ while j < len(string):
+ c = string[j]
+ if ord(c) >= 128 or (c in spacelist) or (c == sep):
+ break
+ j += 1
+ if j < len(string) and string[j] == '>':
+ j += 1
+ res.append(string[i:j])
+ i = j
+ return res
+
+
+def stripoff_tags(x):
+ if not x: return ''
+ chars = []
+ i = 0
+ T = len(x)
+ while i < T:
+ if x[i] == '<':
+ while i < T and x[i] != '>':
+ i += 1
+ i += 1
+ else:
+ chars.append(x[i])
+ i += 1
+ return ''.join(chars)
+
+
+def normalize(sentence, ignore_words, cs, split=None):
+ """ sentence, ignore_words are both in unicode
+ """
+ new_sentence = []
+ for token in sentence:
+ x = token
+ if not cs:
+ x = x.upper()
+ if x in ignore_words:
+ continue
+ if remove_tag:
+ x = stripoff_tags(x)
+ if not x:
+ continue
+ if split and x in split:
+ new_sentence += split[x]
+ else:
+ new_sentence.append(x)
+ return new_sentence
+
+
+class Calculator:
+ def __init__(self):
+ self.data = {}
+ self.space = []
+ self.cost = {}
+ self.cost['cor'] = 0
+ self.cost['sub'] = 1
+ self.cost['del'] = 1
+ self.cost['ins'] = 1
+
+ def calculate(self, lab, rec):
+ # Initialization
+ lab.insert(0, '')
+ rec.insert(0, '')
+ while len(self.space) < len(lab):
+ self.space.append([])
+ for row in self.space:
+ for element in row:
+ element['dist'] = 0
+ element['error'] = 'non'
+ while len(row) < len(rec):
+ row.append({'dist': 0, 'error': 'non'})
+ for i in range(len(lab)):
+ self.space[i][0]['dist'] = i
+ self.space[i][0]['error'] = 'del'
+ for j in range(len(rec)):
+ self.space[0][j]['dist'] = j
+ self.space[0][j]['error'] = 'ins'
+ self.space[0][0]['error'] = 'non'
+ for token in lab:
+ if token not in self.data and len(token) > 0:
+ self.data[token] = {
+ 'all': 0,
+ 'cor': 0,
+ 'sub': 0,
+ 'ins': 0,
+ 'del': 0
+ }
+ for token in rec:
+ if token not in self.data and len(token) > 0:
+ self.data[token] = {
+ 'all': 0,
+ 'cor': 0,
+ 'sub': 0,
+ 'ins': 0,
+ 'del': 0
+ }
+ # Computing edit distance
+ for i, lab_token in enumerate(lab):
+ for j, rec_token in enumerate(rec):
+ if i == 0 or j == 0:
+ continue
+ min_dist = sys.maxsize
+ min_error = 'none'
+ dist = self.space[i - 1][j]['dist'] + self.cost['del']
+ error = 'del'
+ if dist < min_dist:
+ min_dist = dist
+ min_error = error
+ dist = self.space[i][j - 1]['dist'] + self.cost['ins']
+ error = 'ins'
+ if dist < min_dist:
+ min_dist = dist
+ min_error = error
+ if lab_token == rec_token:
+ dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor']
+ error = 'cor'
+ else:
+ dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub']
+ error = 'sub'
+ if dist < min_dist:
+ min_dist = dist
+ min_error = error
+ self.space[i][j]['dist'] = min_dist
+ self.space[i][j]['error'] = min_error
+ # Tracing back
+ result = {
+ 'lab': [],
+ 'rec': [],
+ 'all': 0,
+ 'cor': 0,
+ 'sub': 0,
+ 'ins': 0,
+ 'del': 0
+ }
+ i = len(lab) - 1
+ j = len(rec) - 1
+ while True:
+ if self.space[i][j]['error'] == 'cor': # correct
+ if len(lab[i]) > 0:
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
+ self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1
+ result['all'] = result['all'] + 1
+ result['cor'] = result['cor'] + 1
+ result['lab'].insert(0, lab[i])
+ result['rec'].insert(0, rec[j])
+ i = i - 1
+ j = j - 1
+ elif self.space[i][j]['error'] == 'sub': # substitution
+ if len(lab[i]) > 0:
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
+ self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1
+ result['all'] = result['all'] + 1
+ result['sub'] = result['sub'] + 1
+ result['lab'].insert(0, lab[i])
+ result['rec'].insert(0, rec[j])
+ i = i - 1
+ j = j - 1
+ elif self.space[i][j]['error'] == 'del': # deletion
+ if len(lab[i]) > 0:
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
+ self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1
+ result['all'] = result['all'] + 1
+ result['del'] = result['del'] + 1
+ result['lab'].insert(0, lab[i])
+ result['rec'].insert(0, "")
+ i = i - 1
+ elif self.space[i][j]['error'] == 'ins': # insertion
+ if len(rec[j]) > 0:
+ self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1
+ result['ins'] = result['ins'] + 1
+ result['lab'].insert(0, "")
+ result['rec'].insert(0, rec[j])
+ j = j - 1
+ elif self.space[i][j]['error'] == 'non': # starting point
+ break
+ else: # shouldn't reach here
+ print(
+ 'this should not happen , i = {i} , j = {j} , error = {error}'.
+ format(i=i, j=j, error=self.space[i][j]['error']))
+ return result
+
+ def overall(self):
+ result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
+ for token in self.data:
+ result['all'] = result['all'] + self.data[token]['all']
+ result['cor'] = result['cor'] + self.data[token]['cor']
+ result['sub'] = result['sub'] + self.data[token]['sub']
+ result['ins'] = result['ins'] + self.data[token]['ins']
+ result['del'] = result['del'] + self.data[token]['del']
+ return result
+
+ def cluster(self, data):
+ result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
+ for token in data:
+ if token in self.data:
+ result['all'] = result['all'] + self.data[token]['all']
+ result['cor'] = result['cor'] + self.data[token]['cor']
+ result['sub'] = result['sub'] + self.data[token]['sub']
+ result['ins'] = result['ins'] + self.data[token]['ins']
+ result['del'] = result['del'] + self.data[token]['del']
+ return result
+
+ def keys(self):
+ return list(self.data.keys())
+
+
+def width(string):
+ return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
+
+
+def default_cluster(word):
+ unicode_names = [unicodedata.name(char) for char in word]
+ for i in reversed(range(len(unicode_names))):
+ if unicode_names[i].startswith('DIGIT'): # 1
+ unicode_names[i] = 'Number' # 'DIGIT'
+ elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or
+ unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')):
+ # 明 / 郎
+ unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH'
+ elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or
+ unicode_names[i].startswith('LATIN SMALL LETTER')):
+ # A / a
+ unicode_names[i] = 'English' # 'LATIN LETTER'
+ elif unicode_names[i].startswith('HIRAGANA LETTER'): # は こ め
+ unicode_names[i] = 'Japanese' # 'GANA LETTER'
+ elif (unicode_names[i].startswith('AMPERSAND') or
+ unicode_names[i].startswith('APOSTROPHE') or
+ unicode_names[i].startswith('COMMERCIAL AT') or
+ unicode_names[i].startswith('DEGREE CELSIUS') or
+ unicode_names[i].startswith('EQUALS SIGN') or
+ unicode_names[i].startswith('FULL STOP') or
+ unicode_names[i].startswith('HYPHEN-MINUS') or
+ unicode_names[i].startswith('LOW LINE') or
+ unicode_names[i].startswith('NUMBER SIGN') or
+ unicode_names[i].startswith('PLUS SIGN') or
+ unicode_names[i].startswith('SEMICOLON')):
+ # & / ' / @ / ℃ / = / . / - / _ / # / + / ;
+ del unicode_names[i]
+ else:
+ return 'Other'
+ if len(unicode_names) == 0:
+ return 'Other'
+ if len(unicode_names) == 1:
+ return unicode_names[0]
+ for i in range(len(unicode_names) - 1):
+ if unicode_names[i] != unicode_names[i + 1]:
+ return 'Other'
+ return unicode_names[0]
+
+
+def usage():
+ print(
+ "compute-wer.py : compute word error rate (WER) and align recognition results and references."
+ )
+ print(
+ " usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer"
+ )
+
+
+def main():
+ # python utils/compute-wer.py --char=1 --v=1 ref hyp > rsl.error
+ if len(sys.argv) == 1:
+ usage()
+ sys.exit(0)
+ calculator = Calculator()
+ cluster_file = ''
+ ignore_words = set()
+ tochar = False
+ verbose = 1
+ padding_symbol = ' '
+ case_sensitive = False
+ max_words_per_line = sys.maxsize
+ split = None
+ while len(sys.argv) > 3:
+ a = '--maxw='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):]
+ del sys.argv[1]
+ max_words_per_line = int(b)
+ continue
+ a = '--rt='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ remove_tag = (b == 'true') or (b != '0')
+ continue
+ a = '--cs='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ case_sensitive = (b == 'true') or (b != '0')
+ continue
+ a = '--cluster='
+ if sys.argv[1].startswith(a):
+ cluster_file = sys.argv[1][len(a):]
+ del sys.argv[1]
+ continue
+ a = '--splitfile='
+ if sys.argv[1].startswith(a):
+ split_file = sys.argv[1][len(a):]
+ del sys.argv[1]
+ split = dict()
+ with codecs.open(split_file, 'r', 'utf-8') as fh:
+ for line in fh: # line in unicode
+ words = line.strip().split()
+ if len(words) >= 2:
+ split[words[0]] = words[1:]
+ continue
+ a = '--ig='
+ if sys.argv[1].startswith(a):
+ ignore_file = sys.argv[1][len(a):]
+ del sys.argv[1]
+ with codecs.open(ignore_file, 'r', 'utf-8') as fh:
+ for line in fh: # line in unicode
+ line = line.strip()
+ if len(line) > 0:
+ ignore_words.add(line)
+ continue
+ a = '--char='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ tochar = (b == 'true') or (b != '0')
+ continue
+ a = '--v='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ verbose = 0
+ try:
+ verbose = int(b)
+ except:
+ if b == 'true' or b != '0':
+ verbose = 1
+ continue
+ a = '--padding-symbol='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ if b == 'space':
+ padding_symbol = ' '
+ elif b == 'underline':
+ padding_symbol = '_'
+ continue
+        # any remaining argument at this point is an unrecognized switch; drop it
+        del sys.argv[1]
+        continue
+
+ if not case_sensitive:
+ ig = set([w.upper() for w in ignore_words])
+ ignore_words = ig
+
+ default_clusters = {}
+ default_words = {}
+
+ ref_file = sys.argv[1]
+ hyp_file = sys.argv[2]
+ rec_set = {}
+ if split and not case_sensitive:
+ newsplit = dict()
+ for w in split:
+ words = split[w]
+ for i in range(len(words)):
+ words[i] = words[i].upper()
+ newsplit[w.upper()] = words
+ split = newsplit
+
+ with codecs.open(hyp_file, 'r', 'utf-8') as fh:
+ for line in fh:
+ if tochar:
+ array = characterize(line)
+ else:
+ array = line.strip().split()
+ if len(array) == 0: continue
+ fid = array[0]
+ rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive,
+ split)
+
+ # compute error rate on the interaction of reference file and hyp file
+ for line in open(ref_file, 'r', encoding='utf-8'):
+ if tochar:
+ array = characterize(line)
+ else:
+ array = line.rstrip('\n').split()
+ if len(array) == 0: continue
+ fid = array[0]
+ if fid not in rec_set:
+ continue
+ lab = normalize(array[1:], ignore_words, case_sensitive, split)
+ rec = rec_set[fid]
+ if verbose:
+ print('\nutt: %s' % fid)
+
+ for word in rec + lab:
+ if word not in default_words:
+ default_cluster_name = default_cluster(word)
+ if default_cluster_name not in default_clusters:
+ default_clusters[default_cluster_name] = {}
+ if word not in default_clusters[default_cluster_name]:
+ default_clusters[default_cluster_name][word] = 1
+ default_words[word] = default_cluster_name
+
+ result = calculator.calculate(lab, rec)
+ if verbose:
+ if result['all'] != 0:
+ wer = float(result['ins'] + result['sub'] + result[
+ 'del']) * 100.0 / result['all']
+ else:
+ wer = 0.0
+ print('WER: %4.2f %%' % wer, end=' ')
+ print('N=%d C=%d S=%d D=%d I=%d' %
+ (result['all'], result['cor'], result['sub'], result['del'],
+ result['ins']))
+ space = {}
+ space['lab'] = []
+ space['rec'] = []
+ for idx in range(len(result['lab'])):
+ len_lab = width(result['lab'][idx])
+ len_rec = width(result['rec'][idx])
+ length = max(len_lab, len_rec)
+ space['lab'].append(length - len_lab)
+ space['rec'].append(length - len_rec)
+ upper_lab = len(result['lab'])
+ upper_rec = len(result['rec'])
+ lab1, rec1 = 0, 0
+ while lab1 < upper_lab or rec1 < upper_rec:
+ if verbose > 1:
+ print('lab(%s):' % fid.encode('utf-8'), end=' ')
+ else:
+ print('lab:', end=' ')
+ lab2 = min(upper_lab, lab1 + max_words_per_line)
+ for idx in range(lab1, lab2):
+ token = result['lab'][idx]
+ print('{token}'.format(token=token), end='')
+ for n in range(space['lab'][idx]):
+ print(padding_symbol, end='')
+ print(' ', end='')
+ print()
+ if verbose > 1:
+ print('rec(%s):' % fid.encode('utf-8'), end=' ')
+ else:
+ print('rec:', end=' ')
+ rec2 = min(upper_rec, rec1 + max_words_per_line)
+ for idx in range(rec1, rec2):
+ token = result['rec'][idx]
+ print('{token}'.format(token=token), end='')
+ for n in range(space['rec'][idx]):
+ print(padding_symbol, end='')
+ print(' ', end='')
+ print('\n', end='\n')
+ lab1 = lab2
+ rec1 = rec2
+
+ if verbose:
+ print(
+ '==========================================================================='
+ )
+ print()
+
+ result = calculator.overall()
+ if result['all'] != 0:
+ wer = float(result['ins'] + result['sub'] + result[
+ 'del']) * 100.0 / result['all']
+ else:
+ wer = 0.0
+ print('Overall -> %4.2f %%' % wer, end=' ')
+ print('N=%d C=%d S=%d D=%d I=%d' %
+ (result['all'], result['cor'], result['sub'], result['del'],
+ result['ins']))
+ if not verbose:
+ print()
+
+ if verbose:
+ for cluster_id in default_clusters:
+ result = calculator.cluster(
+ [k for k in default_clusters[cluster_id]])
+ if result['all'] != 0:
+ wer = float(result['ins'] + result['sub'] + result[
+ 'del']) * 100.0 / result['all']
+ else:
+ wer = 0.0
+ print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
+ print('N=%d C=%d S=%d D=%d I=%d' %
+ (result['all'], result['cor'], result['sub'], result['del'],
+ result['ins']))
+ if len(cluster_file) > 0: # compute separated WERs for word clusters
+ cluster_id = ''
+ cluster = []
+ for line in open(cluster_file, 'r', encoding='utf-8'):
+            for token in line.rstrip('\n').split():
+                # end of cluster reached, like </Keyword>
+                if token[0:2] == '</' and token[len(token)-1] == '>' and \
+                   token.lstrip('</').rstrip('>') == cluster_id :
+ result = calculator.cluster(cluster)
+ if result['all'] != 0:
+ wer = float(result['ins'] + result['sub'] + result[
+ 'del']) * 100.0 / result['all']
+ else:
+ wer = 0.0
+ print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
+ print('N=%d C=%d S=%d D=%d I=%d' %
+ (result['all'], result['cor'], result['sub'],
+ result['del'], result['ins']))
+ cluster_id = ''
+ cluster = []
+                # begin of cluster reached, like <Keyword>
+ elif token[0] == '<' and token[len(token)-1] == '>' and \
+ cluster_id == '' :
+ cluster_id = token.lstrip('<').rstrip('>')
+ cluster = []
+ # general terms, like WEATHER / CAR / ...
+ else:
+ cluster.append(token)
+ print()
+ print(
+ '==========================================================================='
+ )
+
+
+if __name__ == '__main__':
+ main()
diff --git a/examples/librispeech/asr5/conf/preprocess.yaml b/examples/librispeech/asr5/conf/preprocess.yaml
new file mode 100644
index 00000000..724782ed
--- /dev/null
+++ b/examples/librispeech/asr5/conf/preprocess.yaml
@@ -0,0 +1,3 @@
+process:
+ # use raw audio
+ - type: wav_process
diff --git a/examples/librispeech/asr5/conf/preprocessor_config.json b/examples/librispeech/asr5/conf/preprocessor_config.json
new file mode 100644
index 00000000..36ebe8b7
--- /dev/null
+++ b/examples/librispeech/asr5/conf/preprocessor_config.json
@@ -0,0 +1,9 @@
+{
+ "do_normalize": true,
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+ "feature_size": 1,
+ "padding_side": "right",
+ "padding_value": 0,
+ "return_attention_mask": true,
+ "sampling_rate": 16000
+}
diff --git a/examples/librispeech/asr5/conf/tuning/decode.yaml b/examples/librispeech/asr5/conf/tuning/decode.yaml
new file mode 100644
index 00000000..e5781495
--- /dev/null
+++ b/examples/librispeech/asr5/conf/tuning/decode.yaml
@@ -0,0 +1,4 @@
+decode_batch_size: 1
+error_rate_type: wer
+decoding_method: "ctc_greedy_search" # 'ctc_greedy_search', 'ctc_prefix_beam_search'
+beam_size: 10
diff --git a/examples/librispeech/asr5/conf/wavlmASR.yaml b/examples/librispeech/asr5/conf/wavlmASR.yaml
new file mode 100644
index 00000000..25f9643e
--- /dev/null
+++ b/examples/librispeech/asr5/conf/wavlmASR.yaml
@@ -0,0 +1,137 @@
+############################################
+# Network Architecture #
+############################################
+freeze_wavlm: False
+normalize_wav: True
+output_norm: True
+init_type: kaiming_uniform # !Warning: needed for convergence
+enc:
+ input_shape: 768
+ dnn_blocks: 2
+ dnn_neurons: 768
+ activation: True
+ normalization: True
+ dropout_rate: [0.15, 0]
+ctc:
+ enc_n_units: 768
+ blank_id: 0
+ dropout_rate: 0.0
+wavlm_params_path: exp/wavlm/wavlm-base-plus.pdparams
+
+
+task_cfg:
+ label_rate: 50.0
+ sample_rate: 16000
+ normalize: True
+ enable_padding: False
+ max_keep_size: None
+ max_sample_size: 250000
+ min_sample_size: 32000
+ dropout_input: 0.1
+ final_dropout: 0.0
+ dropout: 0.1
+ attention_dropout: 0.0
+ activation_dropout: 0.1
+ apply_mask: True
+ mask_length: 10
+ mask_prob: 0.5
+ mask_selection: static
+ mask_other: 0.0
+ no_mask_overlap: False
+ mask_channel_length: 10
+ mask_channel_prob: 0.0
+ mask_channel_selection: static
+ mask_channel_other: 0.0
+ no_mask_channel_overlap: False
+ feature_grad_mult: 0.0
+ layerdrop: 0.1
+ fp16: True
+ extractor_mode: layer_norm
+ encoder_layers: 12
+ encoder_embed_dim: 768
+ encoder_ffn_embed_dim: 3072
+ encoder_attention_heads: 12
+ activation_fn: gelu
+ encoder_layerdrop: 0.0
+ dropout_features: 0.0
+ final_dim: 768
+ untie_final_proj: True
+ layer_norm_first: True
+ conv_feature_layers: "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2"
+ conv_bias: False
+ logit_temp: 0.1
+ target_glu: False
+ mask_min_space: 1
+ mask_channel_min_space: 1
+ conv_pos: 128
+ conv_pos_groups: 16
+ latent_temp: [2.0, 0.5, 0.999995]
+ skip_masked: False
+ skip_nomask: True
+
+###########################################
+# Data #
+###########################################
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test-clean
+
+###########################################
+# Dataloader #
+###########################################
+vocab_filepath: data/lang_char/vocab.txt
+unit_type: char
+mean_std_filepath: ""
+preprocess_config: conf/preprocess.yaml
+sortagrad: 0 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, N: enabled for the first N epochs
+batch_size: 8 # Different batch_size may cause large differences in results
+maxlen_in: 51200000000 # if input length > maxlen_in, batch size is automatically reduced
+maxlen_out: 160000
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 0
+subsampling_factor: 1
+num_encs: 1
+dist_sampler: True
+shortest_first: False
+return_lens_rate: True
+
+############################################
+# Data Augmentation #
+############################################
+audio_augment: # for raw audio
+ sample_rate: 16000
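+  # speed perturbation rates, given as percentages of the original speed (90/100/110 ≈ 0.9x/1.0x/1.1x)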
+ speeds: [90, 100, 110]
+
+###########################################
+# Training #
+###########################################
+n_epoch: 10
+accum_grad: 8
+global_grad_clip: 5.0
+model_scheduler: newbobscheduler
+model_scheduler_conf:
+ improvement_threshold: 0.0025
+ annealing_factor: 0.8
+ patient: 0
+model_optim: adam
+model_optim_conf:
+ lr: 0.0001
+ weight_decay: 0.0
+wavlm_optim: adam
+wavlm_optim_conf:
+ lr: 0.00005
+ weight_decay: 0.0
+wavlm_scheduler: constantlr
+wavlm_scheduler_conf:
+ warmup_steps: 1000
+ lr_decay: 1.0
+log_interval: 1
+checkpoint:
+ kbest_n: 50
+ latest_n: 5
diff --git a/examples/librispeech/asr5/local/data.sh b/examples/librispeech/asr5/local/data.sh
new file mode 100644
index 00000000..8e69dd76
--- /dev/null
+++ b/examples/librispeech/asr5/local/data.sh
@@ -0,0 +1,110 @@
+#!/bin/bash
+
+stage=-1
+stop_stage=100
+
+unit_type=char
+dict_dir=data/lang_char
+
+source ${MAIN_ROOT}/utils/parse_options.sh
+
+mkdir -p data
+mkdir -p ${dict_dir}
+TARGET_DIR=${MAIN_ROOT}/dataset
+mkdir -p ${TARGET_DIR}
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+ # download data, generate manifests
+ python3 ${TARGET_DIR}/librispeech/librispeech.py \
+ --manifest_prefix="data/manifest" \
+ --target_dir="${TARGET_DIR}/librispeech" \
+ --full_download="False"
+
+ if [ $? -ne 0 ]; then
+ echo "Prepare LibriSpeech failed. Terminated."
+ exit 1
+ fi
+
+ for set in train-clean-100 dev-clean test-clean; do
+ mv data/manifest.${set} data/manifest.${set}.raw
+ done
+
+ rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
+ for set in train-clean-100; do
+ cat data/manifest.${set}.raw >> data/manifest.train.raw
+ done
+
+ for set in dev-clean; do
+ cat data/manifest.${set}.raw >> data/manifest.dev.raw
+ done
+
+ for set in test-clean; do
+ cat data/manifest.${set}.raw >> data/manifest.test.raw
+ done
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # compute mean and stddev for normalizer
+ num_workers=$(nproc)
+ python ${MAIN_ROOT}/utils/compute_mean_std.py \
+ --manifest_path="data/manifest.train.raw" \
+ --num_samples=2000 \
+ --spectrum_type="fbank" \
+ --feat_dim=161 \
+ --delta_delta=false \
+ --sample_rate=16000 \
+ --stride_ms=10 \
+ --window_ms=25 \
+ --use_dB_normalization=False \
+ --num_workers=${num_workers} \
+ --output_path="data/mean_std.json"
+
+ if [ $? -ne 0 ]; then
+ echo "Compute mean and stddev failed. Terminated."
+ exit 1
+ fi
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # build vocabulary
+ python3 ${MAIN_ROOT}/utils/build_vocab.py \
+ --unit_type ${unit_type} \
+ --count_threshold=0 \
+ --vocab_path="${dict_dir}/vocab.txt" \
+ --manifest_paths="data/manifest.train.raw"
+
+ if [ $? -ne 0 ]; then
+ echo "Build vocabulary failed. Terminated."
+ exit 1
+ fi
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # format manifest with tokenids, vocab size
+ for set in train dev test dev-clean test-clean; do
+ {
+ python3 ${MAIN_ROOT}/utils/format_data.py \
+ --cmvn_path "data/mean_std.json" \
+ --unit_type ${unit_type} \
+ --vocab_path="${dict_dir}/vocab.txt" \
+ --manifest_path="data/manifest.${set}.raw" \
+ --output_path="data/manifest.${set}"
+
+ if [ $? -ne 0 ]; then
+ echo "Formt manifest.${set} failed. Terminated."
+ exit 1
+ fi
+ }&
+ done
+ wait
+fi
+
+echo "LibriSpeech Data preparation done."
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ mkdir -p exp/wavlm
+ echo "Pretrained wavlm model download"
+ wget -P exp/wavlm https://paddlespeech.bj.bcebos.com/wavlm/wavlm-base-plus.pdparams
+fi
+
+exit 0
\ No newline at end of file
diff --git a/examples/librispeech/asr5/local/test.sh b/examples/librispeech/asr5/local/test.sh
new file mode 100644
index 00000000..18158bd5
--- /dev/null
+++ b/examples/librispeech/asr5/local/test.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+
+set -e
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+expdir=exp
+datadir=data
+
+recog_set="test-clean test-other dev-clean dev-other"
+recog_set="test-clean"
+
+config_path=$1
+decode_config_path=$2
+ckpt_prefix=$3
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+# download language model
+#bash local/download_lm_en.sh
+#if [ $? -ne 0 ]; then
+# exit 1
+#fi
+
+python3 format_rsl.py \
+ --origin_ref data/manifest.test-clean.raw \
+ --trans_ref data/manifest.test-clean.text
+
+
+for type in ctc_greedy_search; do
+ echo "decoding ${type}"
+ batch_size=16
+ python3 -u ${BIN_DIR}/test.py \
+ --ngpu ${ngpu} \
+ --config ${config_path} \
+ --decode_cfg ${decode_config_path} \
+ --result_file ${ckpt_prefix}.${type}.rsl \
+ --checkpoint_path ${ckpt_prefix} \
+ --opts decode.decoding_method ${type} \
+ --opts decode.decode_batch_size ${batch_size}
+
+ if [ $? -ne 0 ]; then
+ echo "Failed in evaluation!"
+ exit 1
+ fi
+ python3 format_rsl.py \
+ --origin_hyp ${ckpt_prefix}.${type}.rsl \
+ --trans_hyp ${ckpt_prefix}.${type}.rsl.text
+
+ python3 compute_wer.py --char=1 --v=1 \
+ data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
+ echo "decoding ${type} done."
+done
+
+for type in ctc_prefix_beam_search; do
+ echo "decoding ${type}"
+ batch_size=1
+ python3 -u ${BIN_DIR}/test.py \
+ --ngpu ${ngpu} \
+ --config ${config_path} \
+ --decode_cfg ${decode_config_path} \
+ --result_file ${ckpt_prefix}.${type}.rsl \
+ --checkpoint_path ${ckpt_prefix} \
+ --opts decode.decoding_method ${type} \
+ --opts decode.decode_batch_size ${batch_size}
+
+ if [ $? -ne 0 ]; then
+ echo "Failed in evaluation!"
+ exit 1
+ fi
+ python3 format_rsl.py \
+ --origin_hyp ${ckpt_prefix}.${type}.rsl \
+ --trans_hyp ${ckpt_prefix}.${type}.rsl.text
+
+ python3 compute_wer.py --char=1 --v=1 \
+ data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
+ echo "decoding ${type} done."
+done
+
+echo "Finished"
+
+exit 0
diff --git a/examples/librispeech/asr5/local/test_wav.sh b/examples/librispeech/asr5/local/test_wav.sh
new file mode 100644
index 00000000..fdf3589f
--- /dev/null
+++ b/examples/librispeech/asr5/local/test_wav.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+if [ $# != 4 ];then
+ echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
+ exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+decode_config_path=$2
+ckpt_prefix=$3
+audio_file=$4
+
+mkdir -p data
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
+if [ $? -ne 0 ]; then
+ exit 1
+fi
+
+if [ ! -f ${audio_file} ]; then
+ echo "Plase input the right audio_file path"
+ exit 1
+fi
+
+chunk_mode=false
+if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
+ chunk_mode=true
+fi
+
+# download language model
+#bash local/download_lm_ch.sh
+#if [ $? -ne 0 ]; then
+# exit 1
+#fi
+
+for type in ctc_greedy_search; do
+ echo "decoding ${type}"
+ batch_size=1
+ output_dir=${ckpt_prefix}
+ mkdir -p ${output_dir}
+ python3 -u ${BIN_DIR}/test_wav.py \
+ --ngpu ${ngpu} \
+ --config ${config_path} \
+ --decode_cfg ${decode_config_path} \
+ --result_file ${output_dir}/${type}.rsl \
+ --checkpoint_path ${ckpt_prefix} \
+ --opts decode.decoding_method ${type} \
+ --opts decode.decode_batch_size ${batch_size} \
+ --audio_file ${audio_file}
+
+ if [ $? -ne 0 ]; then
+ echo "Failed in evaluation!"
+ exit 1
+ fi
+done
+exit 0
diff --git a/examples/librispeech/asr5/local/train.sh b/examples/librispeech/asr5/local/train.sh
new file mode 100644
index 00000000..24776fd1
--- /dev/null
+++ b/examples/librispeech/asr5/local/train.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+if [ $# -lt 2 ] || [ $# -gt 4 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name resume(optional) ips(optional)"
+ exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_name=$2
+resume=$3
+ips=$4
+
+if [ ! $ips ];then
+ ips_config=
+else
+ ips_config="--ips="${ips}
+fi
+
+mkdir -p exp
+
+# seed may break model convergence
+seed=1988
+if [ ${seed} != 0 ]; then
+ export FLAGS_cudnn_deterministic=True
+fi
+
+# export FLAGS_cudnn_exhaustive_search=true
+# export FLAGS_conv_workspace_size_limit=4000
+export FLAGS_allocator_strategy=naive_best_fit
+if [ ${ngpu} == 0 ]; then
+python3 -u ${BIN_DIR}/train.py \
+--ngpu ${ngpu} \
+--config ${config_path} \
+--output exp/${ckpt_name} \
+--seed ${seed} \
+--resume ${resume}
+else
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
+--ngpu ${ngpu} \
+--config ${config_path} \
+--output exp/${ckpt_name} \
+--seed ${seed} \
+--resume ${resume}
+fi
+
+if [ ${seed} != 0 ]; then
+ unset FLAGS_cudnn_deterministic
+fi
+
+if [ $? -ne 0 ]; then
+ echo "Failed in training!"
+ exit 1
+fi
+
+exit 0
diff --git a/examples/librispeech/asr5/path.sh b/examples/librispeech/asr5/path.sh
new file mode 100644
index 00000000..dbf3a940
--- /dev/null
+++ b/examples/librispeech/asr5/path.sh
@@ -0,0 +1,13 @@
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+# export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
+
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/wavlm/bin
diff --git a/examples/librispeech/asr5/run.sh b/examples/librispeech/asr5/run.sh
new file mode 100644
index 00000000..9634bc8c
--- /dev/null
+++ b/examples/librispeech/asr5/run.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+set -e
+
+. ./path.sh || exit 1;
+. ./cmd.sh || exit 1;
+
+gpus=0,1,2
+stage=0
+stop_stage=3
+conf_path=conf/wavlmASR.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
+decode_conf_path=conf/tuning/decode.yaml
+avg_num=3
+resume= # xx e.g. 30
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+audio_file=data/demo_002_en.wav
+
+# avg_ckpt=avg_${avg_num}
+avg_ckpt=4
+ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
+echo "checkpoint name ${ckpt}"
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ bash ./local/data.sh || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # train model, all `ckpt` under `exp` dir
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${resume} ${ips}
+fi
+
+# if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+# # avg n best model
+# ./avg.sh best exp/${ckpt}/checkpoints ${avg_num}
+# fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # greedy search decoder
+ CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # test a single .wav file
+ CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+fi
diff --git a/examples/librispeech/asr5/utils b/examples/librispeech/asr5/utils
new file mode 100644
index 00000000..973afe67
--- /dev/null
+++ b/examples/librispeech/asr5/utils
@@ -0,0 +1 @@
+../../../utils
\ No newline at end of file
diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md
index 85d9e448..fa986c85 100644
--- a/examples/ljspeech/tts0/README.md
+++ b/examples/ljspeech/tts0/README.md
@@ -239,7 +239,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--lang=en \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=tacotron2_ljspeech_ckpt_0.2.0/phone_id_map.txt
```
diff --git a/examples/ljspeech/tts0/local/synthesize_e2e.sh b/examples/ljspeech/tts0/local/synthesize_e2e.sh
index 73dfff60..903ebb47 100755
--- a/examples/ljspeech/tts0/local/synthesize_e2e.sh
+++ b/examples/ljspeech/tts0/local/synthesize_e2e.sh
@@ -16,7 +16,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--lang=en \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
# --inference_dir=${train_output_path}/inference
\ No newline at end of file
diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md
index 85621653..7f0571a1 100644
--- a/examples/ljspeech/tts1/README.md
+++ b/examples/ljspeech/tts1/README.md
@@ -191,7 +191,7 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--transformer-tts-stat=transformer_tts_ljspeech_ckpt_0.4/speech_stats.npy \
--waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
--waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output-dir=exp/default/test_e2e \
--phones-dict=transformer_tts_ljspeech_ckpt_0.4/phone_id_map.txt
```
diff --git a/examples/ljspeech/tts1/local/synthesize_e2e.sh b/examples/ljspeech/tts1/local/synthesize_e2e.sh
index 25a862f9..d6ff9cae 100755
--- a/examples/ljspeech/tts1/local/synthesize_e2e.sh
+++ b/examples/ljspeech/tts1/local/synthesize_e2e.sh
@@ -12,6 +12,6 @@ python3 ${BIN_DIR}/synthesize_e2e.py \
--transformer-tts-stat=dump/train/speech_stats.npy \
--waveflow-config=waveflow_ljspeech_ckpt_0.3/config.yaml \
--waveflow-checkpoint=waveflow_ljspeech_ckpt_0.3/step-2000000.pdparams \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output-dir=${train_output_path}/test_e2e \
--phones-dict=dump/phone_id_map.txt
diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md
index 23b433d4..f1ed111a 100644
--- a/examples/ljspeech/tts3/README.md
+++ b/examples/ljspeech/tts3/README.md
@@ -254,7 +254,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--lang=en \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=exp/default/test_e2e \
--inference_dir=exp/default/inference \
--phones_dict=fastspeech2_nosil_ljspeech_ckpt_0.5/phone_id_map.txt
diff --git a/examples/ljspeech/tts3/local/inference.sh b/examples/ljspeech/tts3/local/inference.sh
index ff192f3e..94d6b371 100755
--- a/examples/ljspeech/tts3/local/inference.sh
+++ b/examples/ljspeech/tts3/local/inference.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_ljspeech \
--voc=pwgan_ljspeech \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_ljspeech \
--voc=hifigan_ljspeech \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en
diff --git a/examples/ljspeech/tts3/local/lite_predict.sh b/examples/ljspeech/tts3/local/lite_predict.sh
index 75db6a0e..9cf1d8d7 100755
--- a/examples/ljspeech/tts3/local/lite_predict.sh
+++ b/examples/ljspeech/tts3/local/lite_predict.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_ljspeech \
--voc=pwgan_ljspeech \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_ljspeech \
--voc=hifigan_ljspeech \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--lang=en
diff --git a/examples/ljspeech/tts3/local/ort_predict.sh b/examples/ljspeech/tts3/local/ort_predict.sh
index b4716f70..b82ec15f 100755
--- a/examples/ljspeech/tts3/local/ort_predict.sh
+++ b/examples/ljspeech/tts3/local/ort_predict.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_ljspeech \
--voc=pwgan_ljspeech\
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
@@ -24,7 +24,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_ljspeech \
--voc=hifigan_ljspeech \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
diff --git a/examples/ljspeech/tts3/local/synthesize_e2e.sh b/examples/ljspeech/tts3/local/synthesize_e2e.sh
index 36865f7f..3f234080 100755
--- a/examples/ljspeech/tts3/local/synthesize_e2e.sh
+++ b/examples/ljspeech/tts3/local/synthesize_e2e.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
--voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
--lang=en \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
@@ -41,7 +41,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_ljspeech_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_ljspeech_ckpt_0.2.0/feats_stats.npy \
--lang=en \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--inference_dir=${train_output_path}/inference \
--phones_dict=dump/phone_id_map.txt
diff --git a/examples/ljspeech/tts3/run.sh b/examples/ljspeech/tts3/run.sh
index aacd4cc0..0d8da920 100755
--- a/examples/ljspeech/tts3/run.sh
+++ b/examples/ljspeech/tts3/run.sh
@@ -45,10 +45,7 @@ fi
# we have only tested the following models so far
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
- version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
- if [[ -z "$version" || ${version} != '1.0.0' ]]; then
- pip install paddle2onnx==1.0.0
- fi
+ pip install paddle2onnx --upgrade
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_ljspeech
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_ljspeech
diff --git a/examples/opencpop/README.md b/examples/opencpop/README.md
new file mode 100644
index 00000000..5a574dc8
--- /dev/null
+++ b/examples/opencpop/README.md
@@ -0,0 +1,6 @@
+
+# Opencpop
+
+* svs1 - DiffSinger
+* voc1 - Parallel WaveGAN
+* voc5 - HiFiGAN
diff --git a/examples/opencpop/svs1/README.md b/examples/opencpop/svs1/README.md
new file mode 100644
index 00000000..43cc6e86
--- /dev/null
+++ b/examples/opencpop/svs1/README.md
@@ -0,0 +1,276 @@
+([简体中文](./README_cn.md)|English)
+# DiffSinger with Opencpop
+This example contains code used to train a [DiffSinger](https://arxiv.org/abs/2105.02446) model with [Mandarin singing corpus](https://wenet.org.cn/opencpop/).
+
+## Dataset
+### Download and Extract
+Download Opencpop from its [Official Website](https://wenet.org.cn/opencpop/download/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/Opencpop`.
+
+## Get Started
+Assume the path to the dataset is `~/datasets/Opencpop`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+ - synthesize waveform from `metadata.jsonl`.
+    - (in progress) synthesize waveform from a text file.
+5. (in progress) inference using the static model.
+```bash
+./run.sh
+```
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
+```bash
+./local/preprocess.sh ${conf_path}
+```
+When it is done, a `dump` folder is created in the current directory. The structure of the `dump` folder is listed below.
+
+```text
+dump
+├── dev
+│ ├── norm
+│ └── raw
+├── phone_id_map.txt
+├── speaker_id_map.txt
+├── test
+│ ├── norm
+│ └── raw
+└── train
+ ├── energy_stats.npy
+ ├── norm
+ ├── pitch_stats.npy
+ ├── raw
+ ├── speech_stats.npy
+ └── speech_stretchs.npy
+
+```
+The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the speech, pitch and energy features of each utterance, while the `norm` folder contains the normalized ones. The statistics used to normalize features are computed from the training set and located in `dump/train/*_stats.npy`. `speech_stretchs.npy` contains the minimum and maximum values of each dimension of the mel spectrum, which are used for linear stretching before training/inference of the diffusion module.
+Note: Since training on non-normalized features works better than on normalized ones here, the features saved under `norm` are in fact not normalized.
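+
+As a rough illustration only, the snippet below shows how such per-dimension min/max values could be used for the linear stretch. The exact loading code lives in the training scripts; in particular, the assumption that `speech_stretchs.npy` stores the minimum and maximum as two rows is ours and not guaranteed by this example.
+```python
+import numpy as np
+
+# Hypothetical sketch: load per-dimension mel min/max and stretch a mel
+# spectrogram into [-1, 1] before the diffusion module (and undo it after).
+spec_min, spec_max = np.load("dump/train/speech_stretchs.npy")  # assumed shape (2, n_mels)
+
+def stretch(mel):
+    # mel: array of shape (T, n_mels)
+    return (mel - spec_min) / (spec_max - spec_min) * 2.0 - 1.0
+
+def destretch(mel_norm):
+    return (mel_norm + 1.0) / 2.0 * (spec_max - spec_min) + spec_min
+```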
+
+
+Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains utterance id, speaker id, phones, text_lengths, speech_lengths, phone durations, the path of speech features, the path of pitch features, the path of energy features, note, note durations, slur.
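+
+If you want to see exactly which fields your own dump contains, `metadata.jsonl` is written in JSON Lines format (one JSON object per line), so a couple of lines of Python are enough to peek at the first record. This is a sketch for inspection only, assuming the `dump/train/norm/metadata.jsonl` path from the tree above:
+```python
+import json
+
+# Print the field names of the first record in the normalized train metadata.
+with open("dump/train/norm/metadata.jsonl", encoding="utf-8") as f:
+    first_record = json.loads(f.readline())
+print(sorted(first_record.keys()))
+```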
+
+### Model Training
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
+```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+Here's the complete help message.
+```text
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+ [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+ [--ngpu NGPU] [--phones-dict PHONES_DICT]
+ [--speaker-dict SPEAKER_DICT] [--speech-stretchs SPEECH_STRETCHS]
+
+Train a DiffSinger model.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config CONFIG diffsinger config file.
+ --train-metadata TRAIN_METADATA
+ training data.
+ --dev-metadata DEV_METADATA
+ dev data.
+ --output-dir OUTPUT_DIR
+ output dir.
+ --ngpu NGPU if ngpu=0, use cpu.
+ --phones-dict PHONES_DICT
+ phone vocabulary file.
+ --speaker-dict SPEAKER_DICT
+ speaker id map file for multiple speaker model.
+ --speech-stretchs SPEECH_STRETCHS
+ min amd max mel for stretching.
+```
+1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
+2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
+3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
+4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+5. `--phones-dict` is the path of the phone vocabulary file.
+6. `--speech-stretchs` is the path of mel's min-max data file.
+
+### Synthesizing
+We use parallel wavegan as the neural vocoder.
+Download pretrained parallel wavegan model from [pwgan_opencpop_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/pwgan_opencpop_ckpt_1.4.0.zip) and unzip it.
+```bash
+unzip pwgan_opencpop_ckpt_1.4.0.zip
+```
+Parallel WaveGAN checkpoint contains files listed below.
+```text
+pwgan_opencpop_ckpt_1.4.0.zip
+├── default.yaml # default config used to train parallel wavegan
+├── snapshot_iter_100000.pdz # model parameters of parallel wavegan
+└── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
+```
+`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize.py [-h]
+ [--am {diffsinger_opencpop}]
+ [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
+ [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
+ [--voc {pwgan_opencpop}]
+ [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
+ [--voc_stat VOC_STAT] [--ngpu NGPU]
+ [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
+ [--speech_stretchs SPEECH_STRETCHS]
+
+Synthesize with acoustic model & vocoder
+
+optional arguments:
+ -h, --help show this help message and exit
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
+ Choose acoustic model type of tts task.
+ {diffsinger_opencpop} Choose acoustic model type of svs task.
+ --am_config AM_CONFIG
+ Config of acoustic model.
+ --am_ckpt AM_CKPT Checkpoint file of acoustic model.
+ --am_stat AM_STAT mean and standard deviation used to normalize
+ spectrogram when training acoustic model.
+ --phones_dict PHONES_DICT
+ phone vocabulary file.
+ --tones_dict TONES_DICT
+ tone vocabulary file.
+ --speaker_dict SPEAKER_DICT
+ speaker id map file.
+ --voice-cloning VOICE_CLONING
+ whether training voice cloning model.
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
+ Choose vocoder type of tts task.
+ {pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task.
+ --voc_config VOC_CONFIG
+ Config of voc.
+ --voc_ckpt VOC_CKPT Checkpoint file of voc.
+ --voc_stat VOC_STAT mean and standard deviation used to normalize
+ spectrogram when training voc.
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --test_metadata TEST_METADATA
+ test metadata.
+ --output_dir OUTPUT_DIR
+ output dir.
+ --speech-stretchs SPEECH_STRETCHS
+ The min and max values of the mel spectrum, using on diffusion of diffsinger.
+```
+
+`./local/synthesize_e2e.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file.
+`local/pinyin_to_phone.txt` comes from the README of the Opencpop dataset and gives the mapping from pinyin to phonemes used in Opencpop.
+
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize_e2e.py [-h]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
+ [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
+ [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
+ [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
+ [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
+ [--voc_stat VOC_STAT] [--lang LANG]
+ [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
+ [--text TEXT] [--output_dir OUTPUT_DIR]
+ [--pinyin_phone PINYIN_PHONE]
+ [--speech_stretchs SPEECH_STRETCHS]
+
+Synthesize with acoustic model & vocoder
+
+optional arguments:
+ -h, --help show this help message and exit
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
+ Choose acoustic model type of tts task.
+ {diffsinger_opencpop} Choose acoustic model type of svs task.
+ --am_config AM_CONFIG
+ Config of acoustic model.
+ --am_ckpt AM_CKPT Checkpoint file of acoustic model.
+ --am_stat AM_STAT mean and standard deviation used to normalize
+ spectrogram when training acoustic model.
+ --phones_dict PHONES_DICT
+ phone vocabulary file.
+ --speaker_dict SPEAKER_DICT
+ speaker id map file.
+ --spk_id SPK_ID spk id for multi speaker acoustic model
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
+ Choose vocoder type of tts task.
+ {pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task.
+ --voc_config VOC_CONFIG
+ Config of voc.
+ --voc_ckpt VOC_CKPT Checkpoint file of voc.
+ --voc_stat VOC_STAT mean and standard deviation used to normalize
+ spectrogram when training voc.
+ --lang LANG {zh, en, mix, canton} Choose language type of tts task.
+ {sing} Choose language type of svs task.
+ --inference_dir INFERENCE_DIR
+ dir to save inference models
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --text TEXT text to synthesize file, a 'utt_id sentence' pair per line for tts task.
+ A '{ utt_id input_type (is word) text notes note_durs}' or '{utt_id input_type (is phoneme) phones notes note_durs is_slurs}' pair per line for svs task.
+ --output_dir OUTPUT_DIR
+ output dir.
+ --pinyin_phone PINYIN_PHONE
+ pinyin to phone map file, using on sing_frontend.
+ --speech_stretchs SPEECH_STRETCHS
+ The min and max values of the mel spectrum, using on diffusion of diffsinger.
+```
+1. `--am` is acoustic model type with the format {model_name}_{dataset}
+2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the diffsinger pretrained model.
+3. `--voc` is vocoder type with the format {model_name}_{dataset}
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model.
+5. `--lang` is the language: `zh`, `en`, `mix` and `canton` for the tts task, `sing` for the svs task.
+6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder.
+7. `--text` is the text file, which contains sentences to synthesize.
+8. `--output_dir` is the directory to save synthesized audio files.
+9. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
+10. `--inference_dir` is the directory to save static models. If this option is not provided, no static model will be exported.
+11. `--pinyin_phone` is the pinyin-to-phone map file used by the singing frontend.
+12. `--speech_stretchs` is the file with the min and max values of the mel spectrum, used by the diffusion module of DiffSinger.
+
+Note: At present, the DiffSinger model does not support dynamic-to-static conversion, so do not add `--inference_dir`.
+
+
+## Pretrained Model
+Pretrained DiffSinger model:
+- [diffsinger_opencpop_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/diffsinger_opencpop_ckpt_1.4.0.zip)
+
+DiffSinger checkpoint contains files listed below.
+```text
+diffsinger_opencpop_ckpt_1.4.0.zip
+├── default.yaml # default config used to train diffsinger
+├── energy_stats.npy # statistics used to normalize energy when training diffsinger if norm is needed
+├── phone_id_map.txt # phone vocabulary file when training diffsinger
+├── pinyin_to_phone.txt # pinyin-to-phoneme mapping file when training diffsinger
+├── pitch_stats.npy # statistics used to normalize pitch when training diffsinger if norm is needed
+├── snapshot_iter_160000.pdz # model parameters of diffsinger
+├── speech_stats.npy # statistics used to normalize mel when training diffsinger if norm is needed
+└── speech_stretchs.npy # min and max values to use for mel spectral stretching before training diffusion
+
+```
+
+You can use the following script to synthesize `${BIN_DIR}/../../assets/sentences_sing.txt` using the pretrained DiffSinger and Parallel WaveGAN models.
+
+```bash
+source path.sh
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=diffsinger_opencpop \
+ --am_config=diffsinger_opencpop_ckpt_1.4.0/default.yaml \
+ --am_ckpt=diffsinger_opencpop_ckpt_1.4.0/snapshot_iter_160000.pdz \
+ --am_stat=diffsinger_opencpop_ckpt_1.4.0/speech_stats.npy \
+ --voc=pwgan_opencpop \
+ --voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \
+ --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
+ --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
+ --lang=sing \
+ --text=${BIN_DIR}/../../assets/sentences_sing.txt \
+ --output_dir=exp/default/test_e2e \
+ --phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
+ --pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \
+ --speech_stretchs=diffsinger_opencpop_ckpt_1.4.0/speech_stretchs.npy
+
+```
diff --git a/examples/opencpop/svs1/README_cn.md b/examples/opencpop/svs1/README_cn.md
new file mode 100644
index 00000000..cf65c97f
--- /dev/null
+++ b/examples/opencpop/svs1/README_cn.md
@@ -0,0 +1,280 @@
+(简体中文|[English](./README.md))
+# 用 Opencpop 数据集训练 DiffSinger 模型
+
+本用例包含用于训练 [DiffSinger](https://arxiv.org/abs/2105.02446) 模型的代码,使用 [Mandarin singing corpus](https://wenet.org.cn/opencpop/) 数据集。
+
+## 数据集
+### 下载并解压
+从 [官方网站](https://wenet.org.cn/opencpop/download/) 下载数据集
+
+## 开始
+假设数据集的路径是 `~/datasets/Opencpop`.
+运行下面的命令会进行如下操作:
+
+1. **设置原路径**。
+2. 对数据集进行预处理。
+3. 训练模型
+4. 合成波形
+ - 从 `metadata.jsonl` 合成波形。
+ - (支持中)从文本文件合成波形。
+5. (支持中)使用静态模型进行推理。
+```bash
+./run.sh
+```
+您可以选择要运行的一系列阶段,或者将 `stage` 设置为 `stop-stage` 以仅使用一个阶段,例如,运行以下命令只会预处理数据集。
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### 数据预处理
+```bash
+./local/preprocess.sh ${conf_path}
+```
+当它完成时。将在当前目录中创建 `dump` 文件夹。转储文件夹的结构如下所示。
+
+```text
+dump
+├── dev
+│ ├── norm
+│ └── raw
+├── phone_id_map.txt
+├── speaker_id_map.txt
+├── test
+│ ├── norm
+│ └── raw
+└── train
+ ├── energy_stats.npy
+ ├── norm
+ ├── pitch_stats.npy
+ ├── raw
+ ├── speech_stats.npy
+ └── speech_stretchs.npy
+```
+
+数据集分为三个部分,即 `train` 、 `dev` 和 `test` ,每个部分都包含一个 `norm` 和 `raw` 子文件夹。原始文件夹包含每个话语的语音、音调和能量特征,而 `norm` 文件夹包含规范化的特征。用于规范化特征的统计数据是从 `dump/train/*_stats.npy` 中的训练集计算出来的。`speech_stretchs.npy` 中包含 mel谱每个维度上的最小值和最大值,用于 diffusion 模块训练/推理前的线性拉伸。
+注意:由于非 norm 特征的训练效果优于 norm 特征,因此 `norm` 下保存的特征是未经过 norm 的特征。
+
+
+此外,还有一个 `metadata.jsonl` 在每个子文件夹中。它是一个类似表格的文件,包含话语id,音色id,音素、文本长度、语音长度、音素持续时间、语音特征路径、音调特征路径、能量特征路径、音调,音调持续时间,是否为转音。
+
+### 模型训练
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
+```
+`./local/train.sh` 调用 `${BIN_DIR}/train.py` 。
+以下是完整的帮助信息。
+
+```text
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+ [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+ [--ngpu NGPU] [--phones-dict PHONES_DICT]
+ [--speaker-dict SPEAKER_DICT] [--speech-stretchs SPEECH_STRETCHS]
+
+Train a DiffSinger model.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config CONFIG diffsinger config file.
+ --train-metadata TRAIN_METADATA
+ training data.
+ --dev-metadata DEV_METADATA
+ dev data.
+ --output-dir OUTPUT_DIR
+ output dir.
+ --ngpu NGPU if ngpu=0, use cpu.
+ --phones-dict PHONES_DICT
+ phone vocabulary file.
+ --speaker-dict SPEAKER_DICT
+ speaker id map file for multiple speaker model.
+ --speech-stretchs SPEECH_STRETCHS
+ min amd max mel for stretching.
+```
+1. `--config` 是一个 yaml 格式的配置文件,用于覆盖默认配置,位于 `conf/default.yaml`.
+2. `--train-metadata` 和 `--dev-metadata` 应为 `dump` 文件夹中 `train` 和 `dev` 下的规范化元数据文件
+3. `--output-dir` 是保存结果的目录。 检查点保存在此目录中的 `checkpoints/` 目录下。
+4. `--ngpu` 要使用的 GPU 数,如果 ngpu==0,则使用 cpu 。
+5. `--phones-dict` 是音素词汇表文件的路径。
+6. `--speech-stretchs` mel的最小最大值数据的文件路径。
+
+### 合成
+我们使用 parallel wavegan 作为神经声码器(vocoder)。
+从 [pwgan_opencpop_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/pwgan_opencpop_ckpt_1.4.0.zip) 下载预训练的 parallel wavegan 模型并将其解压。
+
+```bash
+unzip pwgan_opencpop_ckpt_1.4.0.zip
+```
+Parallel WaveGAN 检查点包含如下文件。
+```text
+pwgan_opencpop_ckpt_1.4.0.zip
+├── default.yaml # 用于训练 parallel wavegan 的默认配置
+├── snapshot_iter_100000.pdz # parallel wavegan 的模型参数
+└── feats_stats.npy # 训练平行波形时用于规范化谱图的统计数据
+```
+`./local/synthesize.sh` 调用 `${BIN_DIR}/../synthesize.py` 即可从 `metadata.jsonl`中合成波形。
+
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize.py [-h]
+ [--am {diffsinger_opencpop}]
+ [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
+ [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
+ [--voc {pwgan_opencpop}]
+ [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
+ [--voc_stat VOC_STAT] [--ngpu NGPU]
+ [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR]
+ [--speech_stretchs SPEECH_STRETCHS]
+
+Synthesize with acoustic model & vocoder
+
+optional arguments:
+ -h, --help show this help message and exit
+ --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}
+ Choose acoustic model type of tts task.
+ {diffsinger_opencpop} Choose acoustic model type of svs task.
+ --am_config AM_CONFIG
+ Config of acoustic model.
+ --am_ckpt AM_CKPT Checkpoint file of acoustic model.
+ --am_stat AM_STAT mean and standard deviation used to normalize
+ spectrogram when training acoustic model.
+ --phones_dict PHONES_DICT
+ phone vocabulary file.
+ --tones_dict TONES_DICT
+ tone vocabulary file.
+ --speaker_dict SPEAKER_DICT
+ speaker id map file.
+ --voice-cloning VOICE_CLONING
+ whether training voice cloning model.
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}
+ Choose vocoder type of tts task.
+ {pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task.
+ --voc_config VOC_CONFIG
+ Config of voc.
+ --voc_ckpt VOC_CKPT Checkpoint file of voc.
+ --voc_stat VOC_STAT mean and standard deviation used to normalize
+ spectrogram when training voc.
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --test_metadata TEST_METADATA
+ test metadata.
+ --output_dir OUTPUT_DIR
+ output dir.
+ --speech-stretchs SPEECH_STRETCHS
+ The min and max values of the mel spectrum, using on diffusion of diffsinger.
+```
+
+`./local/synthesize_e2e.sh` 调用 `${BIN_DIR}/../synthesize_e2e.py`,即可从文本文件中合成波形。
+`local/pinyin_to_phone.txt`来源于opencpop数据集中的README,表示opencpop中拼音到音素的映射。
+
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize_e2e.py [-h]
+ [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
+ [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
+ [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
+ [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
+ [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
+ [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
+ [--voc_stat VOC_STAT] [--lang LANG]
+ [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
+ [--text TEXT] [--output_dir OUTPUT_DIR]
+ [--pinyin_phone PINYIN_PHONE]
+ [--speech_stretchs SPEECH_STRETCHS]
+
+Synthesize with acoustic model & vocoder
+
+optional arguments:
+ -h, --help show this help message and exit
+ --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
+ Choose acoustic model type of tts task.
+ {diffsinger_opencpop} Choose acoustic model type of svs task.
+ --am_config AM_CONFIG
+ Config of acoustic model.
+ --am_ckpt AM_CKPT Checkpoint file of acoustic model.
+ --am_stat AM_STAT mean and standard deviation used to normalize
+ spectrogram when training acoustic model.
+ --phones_dict PHONES_DICT
+ phone vocabulary file.
+ --speaker_dict SPEAKER_DICT
+ speaker id map file.
+ --spk_id SPK_ID spk id for multi speaker acoustic model
+ --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
+ Choose vocoder type of tts task.
+ {pwgan_opencpop, hifigan_opencpop} Choose vocoder type of svs task.
+ --voc_config VOC_CONFIG
+ Config of voc.
+ --voc_ckpt VOC_CKPT Checkpoint file of voc.
+ --voc_stat VOC_STAT mean and standard deviation used to normalize
+ spectrogram when training voc.
+ --lang LANG {zh, en, mix, canton} Choose language type of tts task.
+ {sing} Choose language type of svs task.
+ --inference_dir INFERENCE_DIR
+ dir to save inference models
+ --ngpu NGPU if ngpu == 0, use cpu.
+ --text TEXT text to synthesize file, a 'utt_id sentence' pair per line for tts task.
+ A '{ utt_id input_type (is word) text notes note_durs}' or '{utt_id input_type (is phoneme) phones notes note_durs is_slurs}' pair per line for svs task.
+ --output_dir OUTPUT_DIR
+ output dir.
+ --pinyin_phone PINYIN_PHONE
+ pinyin to phone map file, using on sing_frontend.
+ --speech_stretchs SPEECH_STRETCHS
+ The min and max values of the mel spectrum, using on diffusion of diffsinger.
+```
+1. `--am` 声学模型格式是否符合 {model_name}_{dataset}
+2. `--am_config`, `--am_ckpt`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 diffsinger 预训练模型中的 4 个文件。
+3. `--voc` 声码器(vocoder)格式是否符合 {model_name}_{dataset}
+4. `--voc_config`, `--voc_ckpt`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。
+5. `--lang` tts对应模型的语言可以是 `zh`、`en`、`mix`和`canton`。 svs 对应的语言是 `sing` 。
+6. `--test_metadata` 应为 `dump` 文件夹中 `test` 下的规范化元数据文件、
+7. `--text` 是文本文件,其中包含要合成的句子。
+8. `--output_dir` 是保存合成音频文件的目录。
+9. `--ngpu` 要使用的GPU数,如果 ngpu==0,则使用 cpu。
+10. `--inference_dir` 静态模型保存的目录。如果不加这一行,就不会生成并保存成静态模型。
+11. `--pinyin_phone` 拼音到音素的映射文件。
+12. `--speech_stretchs` mel谱的最大最小值用于diffsinger中diffusion之前的线性拉伸。
+
+注意: 目前 diffsinger 模型还不支持动转静,所以不要加 `--inference_dir`。
+
+
+## 预训练模型
+预先训练的 DiffSinger 模型:
+- [diffsinger_opencpop_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/diffsinger_opencpop_ckpt_1.4.0.zip)
+
+
+DiffSinger 检查点包含下列文件。
+```text
+diffsinger_opencpop_ckpt_1.4.0.zip
+├── default.yaml # 用于训练 diffsinger 的默认配置
+├── energy_stats.npy # 训练 diffsinger 时如若需要 norm energy 会使用到的统计数据
+├── phone_id_map.txt # 训练 diffsinger 时的音素词汇文件
+├── pinyin_to_phone.txt # 训练 diffsinger 时的拼音到音素映射文件
+├── pitch_stats.npy # 训练 diffsinger 时如若需要 norm pitch 会使用到的统计数据
+├── snapshot_iter_160000.pdz # 模型参数和优化器状态
+├── speech_stats.npy # 训练 diffsinger 时用于规范化频谱图的统计数据
+└── speech_stretchs.npy # 训练 diffusion 前用于 mel 谱拉伸的最小及最大值
+
+```
+您可以使用以下脚本通过使用预训练的 diffsinger 和 parallel wavegan 模型为 `${BIN_DIR}/../sentences_sing.txt` 合成句子
+```bash
+source path.sh
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=diffsinger_opencpop \
+ --am_config=diffsinger_opencpop_ckpt_1.4.0/default.yaml \
+ --am_ckpt=diffsinger_opencpop_ckpt_1.4.0/snapshot_iter_160000.pdz \
+ --am_stat=diffsinger_opencpop_ckpt_1.4.0/speech_stats.npy \
+ --voc=pwgan_opencpop \
+ --voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \
+ --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
+ --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
+ --lang=sing \
+ --text=${BIN_DIR}/../../assets/sentences_sing.txt \
+ --output_dir=exp/default/test_e2e \
+ --phones_dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
+ --pinyin_phone=diffsinger_opencpop_ckpt_1.4.0/pinyin_to_phone.txt \
+ --speech_stretchs=diffsinger_opencpop_ckpt_1.4.0/speech_stretchs.npy
+
+```
diff --git a/examples/opencpop/svs1/conf/default.yaml b/examples/opencpop/svs1/conf/default.yaml
new file mode 100644
index 00000000..5d806063
--- /dev/null
+++ b/examples/opencpop/svs1/conf/default.yaml
@@ -0,0 +1,159 @@
+###########################################################
+# FEATURE EXTRACTION SETTING #
+###########################################################
+
+fs: 24000 # sr
+n_fft: 512 # FFT size (samples).
+n_shift: 128 # Hop size (samples). 12.5ms
+win_length: 512 # Window length (samples). 50ms
+ # If set to null, it will be the same as fft_size.
+window: "hann" # Window function.
+
+# Only used for feats_type != raw
+
+fmin: 30 # Minimum frequency of Mel basis.
+fmax: 12000 # Maximum frequency of Mel basis.
+n_mels: 80 # The number of mel basis.
+
+# Only used for the model using pitch features (e.g. FastSpeech2)
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 750 # Maximum f0 for pitch extraction.
+
+
+###########################################################
+# DATA SETTING #
+###########################################################
+batch_size: 48 # batch size
+num_workers: 1 # number of data loader workers
+
+
+###########################################################
+# MODEL SETTING #
+###########################################################
+model:
+ # music score related
+ note_num: 300 # number of note
+ is_slur_num: 2 # number of slur
+ # fastspeech2 module options
+ use_energy_pred: False # whether use energy predictor
+ use_postnet: False # whether use postnet
+
+ # fastspeech2 module
+ fastspeech2_params:
+ adim: 256 # attention dimension
+ aheads: 2 # number of attention heads
+ elayers: 4 # number of encoder layers
+ eunits: 1024 # number of encoder ff units
+ dlayers: 4 # number of decoder layers
+ dunits: 1024 # number of decoder ff units
+ positionwise_layer_type: conv1d-linear # type of position-wise layer
+ positionwise_conv_kernel_size: 9 # kernel size of position wise conv layer
+ transformer_enc_dropout_rate: 0.1 # dropout rate for transformer encoder layer
+ transformer_enc_positional_dropout_rate: 0.1 # dropout rate for transformer encoder positional encoding
+ transformer_enc_attn_dropout_rate: 0.0 # dropout rate for transformer encoder attention layer
+ transformer_activation_type: "gelu" # Activation function type in transformer.
+ encoder_normalize_before: True # whether to perform layer normalization before the input
+ decoder_normalize_before: True # whether to perform layer normalization before the input
+ reduction_factor: 1 # reduction factor
+ init_type: xavier_uniform # initialization type
+ init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding
+ init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding
+ use_scaled_pos_enc: True # whether to use scaled positional encoding
+ transformer_dec_dropout_rate: 0.1 # dropout rate for transformer decoder layer
+ transformer_dec_positional_dropout_rate: 0.1 # dropout rate for transformer decoder positional encoding
+ transformer_dec_attn_dropout_rate: 0.0 # dropout rate for transformer decoder attention layer
+ duration_predictor_layers: 5 # number of layers of duration predictor
+ duration_predictor_chans: 256 # number of channels of duration predictor
+ duration_predictor_kernel_size: 3 # filter size of duration predictor
+    duration_predictor_dropout_rate: 0.5 # dropout rate in duration predictor
+ pitch_predictor_layers: 5 # number of conv layers in pitch predictor
+ pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
+        pitch_predictor_kernel_size: 5         # kernel size of conv layers in pitch predictor
+ pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
+ pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
+ pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
+ stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
+ energy_predictor_layers: 2 # number of conv layers in energy predictor
+ energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
+        energy_predictor_kernel_size: 3        # kernel size of conv layers in energy predictor
+ energy_predictor_dropout: 0.5 # dropout rate in energy predictor
+ energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
+ energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
+ stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
+ postnet_layers: 5 # number of layers of postnet
+ postnet_filts: 5 # filter size of conv layers in postnet
+ postnet_chans: 256 # number of channels of conv layers in postnet
+ postnet_dropout_rate: 0.5 # dropout rate for postnet
+
+ # denoiser module
+ denoiser_params:
+ in_channels: 80 # Number of channels of the input mel-spectrogram
+ out_channels: 80 # Number of channels of the output mel-spectrogram
+ kernel_size: 3 # Kernel size of the residual blocks inside
+ layers: 20 # Number of residual blocks inside
+ stacks: 5 # The number of groups to split the residual blocks into
+ residual_channels: 256 # Residual channel of the residual blocks
+ gate_channels: 512 # Gate channel of the residual blocks
+ skip_channels: 256 # Skip channel of the residual blocks
+ aux_channels: 256 # Auxiliary channel of the residual blocks
+ dropout: 0.1 # Dropout of the residual blocks
+ bias: True # Whether to use bias in residual blocks
+ use_weight_norm: False # Whether to use weight norm in all convolutions
+ init_type: "kaiming_normal" # Type of initialize weights of a neural network module
+
+
+ diffusion_params:
+        num_train_timesteps: 100   # Number of diffusion timesteps between real data and noise used during training
+ beta_start: 0.0001 # beta start parameter for the scheduler
+ beta_end: 0.06 # beta end parameter for the scheduler
+ beta_schedule: "linear" # beta schedule parameter for the scheduler
+        num_max_timesteps: 100     # Maximum timestep used when diffusing from real data towards noise
+ stretch: True # whether to stretch before diffusion
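+        # Note: a "linear" beta schedule typically spaces beta_t evenly from beta_start
+        # to beta_end over num_train_timesteps steps (here 1e-4 -> 0.06 over 100 steps).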
+
+
+###########################################################
+# UPDATER SETTING #
+###########################################################
+fs2_updater:
+ use_masking: True # whether to apply masking for padded part in loss calculation
+
+ds_updater:
+ use_masking: True # whether to apply masking for padded part in loss calculation
+
+
+###########################################################
+# OPTIMIZER SETTING #
+###########################################################
+# fastspeech2 optimizer
+fs2_optimizer:
+ optim: adam # optimizer type
+ learning_rate: 0.001 # learning rate
+
+# diffusion optimizer
+ds_optimizer_params:
+ beta1: 0.9
+ beta2: 0.98
+ weight_decay: 0.0
+
+ds_scheduler_params:
+ learning_rate: 0.001
+ gamma: 0.5
+ step_size: 50000
+ds_grad_norm: 1
+
+
+###########################################################
+# INTERVAL SETTING #
+###########################################################
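+# Note on the training schedule: the FastSpeech2 module is trained alone for the
+# first ds_train_start_steps steps; after that the diffusion module is trained
+# (with FastSpeech2 frozen when only_train_diffusion is True) until train_max_steps.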
+only_train_diffusion: True # Whether to freeze fastspeech2 parameters when training diffusion
+ds_train_start_steps: 160000 # Number of steps to start to train diffusion module.
+train_max_steps: 320000 # Number of training steps.
+save_interval_steps: 2000 # Interval steps to save checkpoint.
+eval_interval_steps: 2000 # Interval steps to evaluate the network.
+num_snapshots: 5
+
+
+###########################################################
+# OTHER SETTING #
+###########################################################
+seed: 10086
diff --git a/examples/opencpop/svs1/local/pinyin_to_phone.txt b/examples/opencpop/svs1/local/pinyin_to_phone.txt
new file mode 100644
index 00000000..34ed079d
--- /dev/null
+++ b/examples/opencpop/svs1/local/pinyin_to_phone.txt
@@ -0,0 +1,418 @@
+a|a
+ai|ai
+an|an
+ang|ang
+ao|ao
+ba|b a
+bai|b ai
+ban|b an
+bang|b ang
+bao|b ao
+bei|b ei
+ben|b en
+beng|b eng
+bi|b i
+bian|b ian
+biao|b iao
+bie|b ie
+bin|b in
+bing|b ing
+bo|b o
+bu|b u
+ca|c a
+cai|c ai
+can|c an
+cang|c ang
+cao|c ao
+ce|c e
+cei|c ei
+cen|c en
+ceng|c eng
+cha|ch a
+chai|ch ai
+chan|ch an
+chang|ch ang
+chao|ch ao
+che|ch e
+chen|ch en
+cheng|ch eng
+chi|ch i
+chong|ch ong
+chou|ch ou
+chu|ch u
+chua|ch ua
+chuai|ch uai
+chuan|ch uan
+chuang|ch uang
+chui|ch ui
+chun|ch un
+chuo|ch uo
+ci|c i
+cong|c ong
+cou|c ou
+cu|c u
+cuan|c uan
+cui|c ui
+cun|c un
+cuo|c uo
+da|d a
+dai|d ai
+dan|d an
+dang|d ang
+dao|d ao
+de|d e
+dei|d ei
+den|d en
+deng|d eng
+di|d i
+dia|d ia
+dian|d ian
+diao|d iao
+die|d ie
+ding|d ing
+diu|d iu
+dong|d ong
+dou|d ou
+du|d u
+duan|d uan
+dui|d ui
+dun|d un
+duo|d uo
+e|e
+ei|ei
+en|en
+eng|eng
+er|er
+fa|f a
+fan|f an
+fang|f ang
+fei|f ei
+fen|f en
+feng|f eng
+fo|f o
+fou|f ou
+fu|f u
+ga|g a
+gai|g ai
+gan|g an
+gang|g ang
+gao|g ao
+ge|g e
+gei|g ei
+gen|g en
+geng|g eng
+gong|g ong
+gou|g ou
+gu|g u
+gua|g ua
+guai|g uai
+guan|g uan
+guang|g uang
+gui|g ui
+gun|g un
+guo|g uo
+ha|h a
+hai|h ai
+han|h an
+hang|h ang
+hao|h ao
+he|h e
+hei|h ei
+hen|h en
+heng|h eng
+hm|h m
+hng|h ng
+hong|h ong
+hou|h ou
+hu|h u
+hua|h ua
+huai|h uai
+huan|h uan
+huang|h uang
+hui|h ui
+hun|h un
+huo|h uo
+ji|j i
+jia|j ia
+jian|j ian
+jiang|j iang
+jiao|j iao
+jie|j ie
+jin|j in
+jing|j ing
+jiong|j iong
+jiu|j iu
+ju|j v
+juan|j van
+jue|j ve
+jun|j vn
+ka|k a
+kai|k ai
+kan|k an
+kang|k ang
+kao|k ao
+ke|k e
+kei|k ei
+ken|k en
+keng|k eng
+kong|k ong
+kou|k ou
+ku|k u
+kua|k ua
+kuai|k uai
+kuan|k uan
+kuang|k uang
+kui|k ui
+kun|k un
+kuo|k uo
+la|l a
+lai|l ai
+lan|l an
+lang|l ang
+lao|l ao
+le|l e
+lei|l ei
+leng|l eng
+li|l i
+lia|l ia
+lian|l ian
+liang|l iang
+liao|l iao
+lie|l ie
+lin|l in
+ling|l ing
+liu|l iu
+lo|l o
+long|l ong
+lou|l ou
+lu|l u
+luan|l uan
+lun|l un
+luo|l uo
+lv|l v
+lve|l ve
+m|m
+ma|m a
+mai|m ai
+man|m an
+mang|m ang
+mao|m ao
+me|m e
+mei|m ei
+men|m en
+meng|m eng
+mi|m i
+mian|m ian
+miao|m iao
+mie|m ie
+min|m in
+ming|m ing
+miu|m iu
+mo|m o
+mou|m ou
+mu|m u
+n|n
+na|n a
+nai|n ai
+nan|n an
+nang|n ang
+nao|n ao
+ne|n e
+nei|n ei
+nen|n en
+neng|n eng
+ng|n g
+ni|n i
+nian|n ian
+niang|n iang
+niao|n iao
+nie|n ie
+nin|n in
+ning|n ing
+niu|n iu
+nong|n ong
+nou|n ou
+nu|n u
+nuan|n uan
+nun|n un
+nuo|n uo
+nv|n v
+nve|n ve
+o|o
+ou|ou
+pa|p a
+pai|p ai
+pan|p an
+pang|p ang
+pao|p ao
+pei|p ei
+pen|p en
+peng|p eng
+pi|p i
+pian|p ian
+piao|p iao
+pie|p ie
+pin|p in
+ping|p ing
+po|p o
+pou|p ou
+pu|p u
+qi|q i
+qia|q ia
+qian|q ian
+qiang|q iang
+qiao|q iao
+qie|q ie
+qin|q in
+qing|q ing
+qiong|q iong
+qiu|q iu
+qu|q v
+quan|q van
+que|q ve
+qun|q vn
+ran|r an
+rang|r ang
+rao|r ao
+re|r e
+ren|r en
+reng|r eng
+ri|r i
+rong|r ong
+rou|r ou
+ru|r u
+rua|r ua
+ruan|r uan
+rui|r ui
+run|r un
+ruo|r uo
+sa|s a
+sai|s ai
+san|s an
+sang|s ang
+sao|s ao
+se|s e
+sen|s en
+seng|s eng
+sha|sh a
+shai|sh ai
+shan|sh an
+shang|sh ang
+shao|sh ao
+she|sh e
+shei|sh ei
+shen|sh en
+sheng|sh eng
+shi|sh i
+shou|sh ou
+shu|sh u
+shua|sh ua
+shuai|sh uai
+shuan|sh uan
+shuang|sh uang
+shui|sh ui
+shun|sh un
+shuo|sh uo
+si|s i
+song|s ong
+sou|s ou
+su|s u
+suan|s uan
+sui|s ui
+sun|s un
+suo|s uo
+ta|t a
+tai|t ai
+tan|t an
+tang|t ang
+tao|t ao
+te|t e
+tei|t ei
+teng|t eng
+ti|t i
+tian|t ian
+tiao|t iao
+tie|t ie
+ting|t ing
+tong|t ong
+tou|t ou
+tu|t u
+tuan|t uan
+tui|t ui
+tun|t un
+tuo|t uo
+wa|w a
+wai|w ai
+wan|w an
+wang|w ang
+wei|w ei
+wen|w en
+weng|w eng
+wo|w o
+wu|w u
+xi|x i
+xia|x ia
+xian|x ian
+xiang|x iang
+xiao|x iao
+xie|x ie
+xin|x in
+xing|x ing
+xiong|x iong
+xiu|x iu
+xu|x v
+xuan|x van
+xue|x ve
+xun|x vn
+ya|y a
+yan|y an
+yang|y ang
+yao|y ao
+ye|y e
+yi|y i
+yin|y in
+ying|y ing
+yo|y o
+yong|y ong
+you|y ou
+yu|y v
+yuan|y van
+yue|y ve
+yun|y vn
+za|z a
+zai|z ai
+zan|z an
+zang|z ang
+zao|z ao
+ze|z e
+zei|z ei
+zen|z en
+zeng|z eng
+zha|zh a
+zhai|zh ai
+zhan|zh an
+zhang|zh ang
+zhao|zh ao
+zhe|zh e
+zhei|zh ei
+zhen|zh en
+zheng|zh eng
+zhi|zh i
+zhong|zh ong
+zhou|zh ou
+zhu|zh u
+zhua|zh ua
+zhuai|zh uai
+zhuan|zh uan
+zhuang|zh uang
+zhui|zh ui
+zhun|zh un
+zhuo|zh uo
+zi|z i
+zong|z ong
+zou|z ou
+zu|z u
+zuan|z uan
+zui|z ui
+zun|z un
+zuo|z uo
\ No newline at end of file
diff --git a/examples/opencpop/svs1/local/preprocess.sh b/examples/opencpop/svs1/local/preprocess.sh
new file mode 100755
index 00000000..26fd4468
--- /dev/null
+++ b/examples/opencpop/svs1/local/preprocess.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+
+stage=0
+stop_stage=100
+
+config_path=$1
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # extract features
+ echo "Extract features ..."
+ python3 ${BIN_DIR}/preprocess.py \
+ --dataset=opencpop \
+ --rootdir=~/datasets/Opencpop/segments \
+ --dumpdir=dump \
+ --label-file=~/datasets/Opencpop/segments/transcriptions.txt \
+ --config=${config_path} \
+ --num-cpu=20 \
+ --cut-sil=True
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # get features' stats (mean and std)
+ echo "Get features' stats ..."
+ python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --field-name="speech"
+
+ python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --field-name="pitch"
+
+ python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --field-name="energy"
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # normalize and convert phone/speaker to id; dev and test should use train's stats
+ echo "Normalize ..."
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --dumpdir=dump/train/norm \
+ --speech-stats=dump/train/speech_stats.npy \
+ --pitch-stats=dump/train/pitch_stats.npy \
+ --energy-stats=dump/train/energy_stats.npy \
+ --phones-dict=dump/phone_id_map.txt \
+ --speaker-dict=dump/speaker_id_map.txt
+
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/dev/raw/metadata.jsonl \
+ --dumpdir=dump/dev/norm \
+ --speech-stats=dump/train/speech_stats.npy \
+ --pitch-stats=dump/train/pitch_stats.npy \
+ --energy-stats=dump/train/energy_stats.npy \
+ --phones-dict=dump/phone_id_map.txt \
+ --speaker-dict=dump/speaker_id_map.txt
+
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/test/raw/metadata.jsonl \
+ --dumpdir=dump/test/norm \
+ --speech-stats=dump/train/speech_stats.npy \
+ --pitch-stats=dump/train/pitch_stats.npy \
+ --energy-stats=dump/train/energy_stats.npy \
+ --phones-dict=dump/phone_id_map.txt \
+ --speaker-dict=dump/speaker_id_map.txt
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # Get feature (mel) extremum for diffusion stretch
+    echo "Get feature (mel) extremum ..."
+ python3 ${BIN_DIR}/get_minmax.py \
+ --metadata=dump/train/norm/metadata.jsonl \
+ --speech-stretchs=dump/train/speech_stretchs.npy
+fi
diff --git a/examples/opencpop/svs1/local/synthesize.sh b/examples/opencpop/svs1/local/synthesize.sh
new file mode 100755
index 00000000..1159e007
--- /dev/null
+++ b/examples/opencpop/svs1/local/synthesize.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=diffsinger_opencpop \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_opencpop \
+ --voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \
+ --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
+ --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt \
+ --speech_stretchs=dump/train/speech_stretchs.npy
+fi
+
diff --git a/examples/opencpop/svs1/local/synthesize_e2e.sh b/examples/opencpop/svs1/local/synthesize_e2e.sh
new file mode 100755
index 00000000..e8d0cc45
--- /dev/null
+++ b/examples/opencpop/svs1/local/synthesize_e2e.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=diffsinger_opencpop \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_opencpop \
+ --voc_config=pwgan_opencpop_ckpt_1.4.0/default.yaml \
+ --voc_ckpt=pwgan_opencpop_ckpt_1.4.0/snapshot_iter_100000.pdz \
+ --voc_stat=pwgan_opencpop_ckpt_1.4.0/feats_stats.npy \
+ --lang=sing \
+ --text=${BIN_DIR}/../../assets/sentences_sing.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --speech_stretchs=dump/train/speech_stretchs.npy \
+ --pinyin_phone=local/pinyin_to_phone.txt
+fi
+
+# for more GAN Vocoders
+# hifigan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ echo "in hifigan syn_e2e"
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=diffsinger_opencpop \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=hifigan_opencpop \
+ --voc_config=hifigan_opencpop_ckpt_1.4.0/default.yaml \
+ --voc_ckpt=hifigan_opencpop_ckpt_1.4.0/snapshot_iter_625000.pdz \
+ --voc_stat=hifigan_opencpop_ckpt_1.4.0/feats_stats.npy \
+ --lang=sing \
+ --text=${BIN_DIR}/../../assets/sentences_sing.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --phones_dict=dump/phone_id_map.txt \
+ --speech_stretchs=dump/train/speech_stretchs.npy \
+ --pinyin_phone=local/pinyin_to_phone.txt
+
+fi
diff --git a/examples/opencpop/svs1/local/train.sh b/examples/opencpop/svs1/local/train.sh
new file mode 100755
index 00000000..5be624fc
--- /dev/null
+++ b/examples/opencpop/svs1/local/train.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+
+python3 ${BIN_DIR}/train.py \
+ --train-metadata=dump/train/norm/metadata.jsonl \
+ --dev-metadata=dump/dev/norm/metadata.jsonl \
+ --config=${config_path} \
+ --output-dir=${train_output_path} \
+ --ngpu=1 \
+ --phones-dict=dump/phone_id_map.txt \
+ --speech-stretchs=dump/train/speech_stretchs.npy
diff --git a/examples/opencpop/svs1/path.sh b/examples/opencpop/svs1/path.sh
new file mode 100755
index 00000000..8bda5dce
--- /dev/null
+++ b/examples/opencpop/svs1/path.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+MODEL=diffsinger
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/opencpop/svs1/run.sh b/examples/opencpop/svs1/run.sh
new file mode 100755
index 00000000..bfe5b659
--- /dev/null
+++ b/examples/opencpop/svs1/run.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_320000.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # synthesize, vocoder is pwgan by default
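+    # NOTE: local/synthesize.sh loads the pretrained pwgan_opencpop_ckpt_1.4.0 vocoder
+    # from the current directory, so download and unzip it here first (see the
+    # download link in examples/opencpop/voc1/README.md).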
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # synthesize_e2e, vocoder is pwgan by default
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
diff --git a/examples/opencpop/voc1/README.md b/examples/opencpop/voc1/README.md
new file mode 100644
index 00000000..37570a64
--- /dev/null
+++ b/examples/opencpop/voc1/README.md
@@ -0,0 +1,139 @@
+# Parallel WaveGAN with Opencpop
+This example contains code used to train a [parallel wavegan](http://arxiv.org/abs/1910.11480) model with a [Mandarin singing corpus](https://wenet.org.cn/opencpop/).
+
+## Dataset
+### Download and Extract
+Download Opencpop from its [Official Website](https://wenet.org.cn/opencpop/download/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/Opencpop`.
+
+## Get Started
+Assume the path to the dataset is `~/datasets/Opencpop`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+ - synthesize waveform from `metadata.jsonl`.
+```bash
+./run.sh
+```
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+### Data Preprocessing
+```bash
+./local/preprocess.sh ${conf_path}
+```
+When it is done, a `dump` folder is created in the current directory. The structure of the `dump` folder is listed below.
+
+```text
+dump
+├── dev
+│ ├── norm
+│ └── raw
+├── test
+│ ├── norm
+│ └── raw
+└── train
+ ├── norm
+ ├── raw
+ └── feats_stats.npy
+```
+The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and a `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the `norm` folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set and stored in `dump/train/feats_stats.npy`.
+
+Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains the id and the path to the spectrogram of each utterance.
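+For illustration only, an entry in `metadata.jsonl` has roughly the following shape; the exact field names are defined by the preprocessing script and may differ:
+```text
+# hypothetical example entry (one JSON object per line)
+{"utt_id": "2001000001", "feats": "dump/train/raw/2001000001_feats.npy", "num_frames": 266}
+```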
+
+### Model Training
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
+```
+`./local/train.sh` calls `${BIN_DIR}/train.py`.
+Here's the complete help message.
+
+```text
+usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
+ [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
+ [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER]
+ [--run-benchmark RUN_BENCHMARK]
+ [--profiler_options PROFILER_OPTIONS]
+
+Train a ParallelWaveGAN model.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --config CONFIG ParallelWaveGAN config file.
+ --train-metadata TRAIN_METADATA
+ training data.
+ --dev-metadata DEV_METADATA
+ dev data.
+ --output-dir OUTPUT_DIR
+ output dir.
+ --ngpu NGPU if ngpu == 0, use cpu.
+
+benchmark:
+ arguments related to benchmark.
+
+ --batch-size BATCH_SIZE
+ batch size.
+ --max-iter MAX_ITER train max steps.
+ --run-benchmark RUN_BENCHMARK
+ runing benchmark or not, if True, use the --batch-size
+ and --max-iter.
+ --profiler_options PROFILER_OPTIONS
+ The option of profiler, which should be in format
+ "key1=value1;key2=value2;key3=value3".
+```
+
+1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
+2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
+3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
+4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
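+
+If you prefer to call `train.py` directly instead of going through `./local/train.sh`, a minimal sketch (using only the flags from the help message above, with `path.sh` sourced so that `${BIN_DIR}` is set) looks like this:
+```bash
+python3 ${BIN_DIR}/train.py \
+    --config=conf/default.yaml \
+    --train-metadata=dump/train/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --output-dir=exp/default \
+    --ngpu=1
+```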
+
+### Synthesizing
+`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
+```bash
+CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
+```
+```text
+usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
+ [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
+ [--output-dir OUTPUT_DIR] [--ngpu NGPU]
+
+Synthesize with GANVocoder.
+
+optional arguments:
+ -h, --help show this help message and exit
+ --generator-type GENERATOR_TYPE
+ type of GANVocoder, should in {pwgan, mb_melgan,
+ style_melgan, } now
+ --config CONFIG GANVocoder config file.
+ --checkpoint CHECKPOINT
+ snapshot to load.
+ --test-metadata TEST_METADATA
+ dev data.
+ --output-dir OUTPUT_DIR
+ output dir.
+ --ngpu NGPU if ngpu == 0, use cpu.
+```
+
+1. `--config` is the parallel wavegan config file. You should use the same config with which the model is trained.
+2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
+3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
+4. `--output-dir` is the directory to save the synthesized audio files.
+5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
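+
+As a sketch, a direct call to `synthesize.py` with the flags documented above (assuming a checkpoint such as `snapshot_iter_100000.pdz` under `exp/default/checkpoints/`) might look like:
+```bash
+python3 ${BIN_DIR}/../synthesize.py \
+    --generator-type=pwgan \
+    --config=conf/default.yaml \
+    --checkpoint=exp/default/checkpoints/snapshot_iter_100000.pdz \
+    --test-metadata=dump/dev/norm/metadata.jsonl \
+    --output-dir=exp/default/test \
+    --ngpu=1
+```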
+
+## Pretrained Models
+The pretrained model can be downloaded here:
+- [pwgan_opencpop_ckpt_1.4.0](https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/pwgan_opencpop_ckpt_1.4.0.zip)
+
+
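+For example, you can fetch and unpack the checkpoint in the example directory with:
+```bash
+wget https://paddlespeech.bj.bcebos.com/t2s/svs/opencpop/pwgan_opencpop_ckpt_1.4.0.zip
+unzip pwgan_opencpop_ckpt_1.4.0.zip
+```
+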
+The Parallel WaveGAN checkpoint contains the files listed below.
+
+```text
+pwgan_opencpop_ckpt_1.4.0
+├── default.yaml # default config used to train parallel wavegan
+├── snapshot_iter_100000.pdz # generator parameters of parallel wavegan
+└── feats_stats.npy # statistics used to normalize spectrogram when training parallel wavegan
+```
+## Acknowledgement
+We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN.
diff --git a/examples/opencpop/voc1/conf/default.yaml b/examples/opencpop/voc1/conf/default.yaml
new file mode 100644
index 00000000..ee99719d
--- /dev/null
+++ b/examples/opencpop/voc1/conf/default.yaml
@@ -0,0 +1,119 @@
+# This is the hyperparameter configuration file for Parallel WaveGAN.
+# Please make sure this is adjusted for the Opencpop dataset. If you want to
+# apply it to another dataset, you might need to carefully change some parameters.
+# This configuration requires 12 GB GPU memory and takes ~3 days on RTX TITAN.
+
+###########################################################
+# FEATURE EXTRACTION SETTING #
+###########################################################
+fs: 24000 # Sampling rate.
+n_fft: 512 # FFT size (samples).
+n_shift: 128          # Hop size (samples). ~5.3 ms at fs=24000.
+win_length: 512       # Window length (samples). ~21.3 ms at fs=24000.
+ # If set to null, it will be the same as fft_size.
+window: "hann" # Window function.
+n_mels: 80 # Number of mel basis.
+fmin: 30 # Minimum freq in mel basis calculation. (Hz)
+fmax: 12000 # Maximum frequency in mel basis calculation. (Hz)
+
+
+###########################################################
+# GENERATOR NETWORK ARCHITECTURE SETTING #
+###########################################################
+generator_params:
+ in_channels: 1 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+ kernel_size: 3 # Kernel size of dilated convolution.
+ layers: 30 # Number of residual block layers.
+ stacks: 3 # Number of stacks i.e., dilation cycles.
+ residual_channels: 64 # Number of channels in residual conv.
+ gate_channels: 128 # Number of channels in gated conv.
+ skip_channels: 64 # Number of channels in skip conv.
+ aux_channels: 80 # Number of channels for auxiliary feature conv.
+ # Must be the same as num_mels.
+ aux_context_window: 2 # Context window size for auxiliary feature.
+ # If set to 2, previous 2 and future 2 frames will be considered.
+ dropout: 0.0 # Dropout rate. 0.0 means no dropout applied.
+ bias: True # use bias in residual blocks
+ use_weight_norm: True # Whether to use weight norm.
+ # If set to true, it will be applied to all of the conv layers.
+ use_causal_conv: False # use causal conv in residual blocks and upsample layers
+    upsample_scales: [8, 4, 2, 2]     # Upsampling scales. Product of these must be the same as the hop size.
+ interpolate_mode: "nearest" # upsample net interpolate mode
+    freq_axis_kernel_size: 1          # upsampling net: convolution kernel size in frequency axis
+ nonlinear_activation: null
+ nonlinear_activation_params: {}
+
+###########################################################
+# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
+###########################################################
+discriminator_params:
+ in_channels: 1 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+    kernel_size: 3        # Kernel size of conv layers.
+ layers: 10 # Number of conv layers.
+    conv_channels: 64     # Number of channels in conv layers.
+ bias: True # Whether to use bias parameter in conv.
+ use_weight_norm: True # Whether to use weight norm.
+ # If set to true, it will be applied to all of the conv layers.
+ nonlinear_activation: "leakyrelu" # Nonlinear function after each conv.
+ nonlinear_activation_params: # Nonlinear function parameters
+ negative_slope: 0.2 # Alpha in leakyrelu.
+
+###########################################################
+# STFT LOSS SETTING #
+###########################################################
+stft_loss_params:
+ fft_sizes: [1024, 2048, 512] # List of FFT size for STFT-based loss.
+    hop_sizes: [120, 240, 50]      # List of hop sizes for STFT-based loss.
+ win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
+ window: "hann" # Window function for STFT-based loss
+
+###########################################################
+# ADVERSARIAL LOSS SETTING #
+###########################################################
+lambda_adv: 4.0 # Loss balancing coefficient.
+
+###########################################################
+# DATA LOADER SETTING #
+###########################################################
+batch_size: 8 # Batch size.
+batch_max_steps: 25500  # Length of each audio clip in the batch. Make sure it is divisible by n_shift.
+num_workers: 1 # Number of workers in DataLoader.
+
+###########################################################
+# OPTIMIZER & SCHEDULER SETTING #
+###########################################################
+generator_optimizer_params:
+ epsilon: 1.0e-6 # Generator's epsilon.
+ weight_decay: 0.0 # Generator's weight decay coefficient.
+generator_scheduler_params:
+ learning_rate: 0.0001 # Generator's learning rate.
+ step_size: 200000 # Generator's scheduler step size.
+ gamma: 0.5 # Generator's scheduler gamma.
+ # At each step size, lr will be multiplied by this parameter.
+generator_grad_norm: 10 # Generator's gradient norm.
+discriminator_optimizer_params:
+ epsilon: 1.0e-6 # Discriminator's epsilon.
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
+discriminator_scheduler_params:
+ learning_rate: 0.00005 # Discriminator's learning rate.
+ step_size: 200000 # Discriminator's scheduler step size.
+ gamma: 0.5 # Discriminator's scheduler gamma.
+ # At each step size, lr will be multiplied by this parameter.
+discriminator_grad_norm: 1 # Discriminator's gradient norm.
+
+###########################################################
+# INTERVAL SETTING #
+###########################################################
+discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
+train_max_steps: 400000 # Number of training steps.
+save_interval_steps: 5000 # Interval steps to save checkpoint.
+eval_interval_steps: 1000 # Interval steps to evaluate the network.
+
+###########################################################
+# OTHER SETTING #
+###########################################################
+num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
+num_snapshots: 10 # max number of snapshots to keep while training
+seed: 42 # random seed for paddle, random, and np.random
diff --git a/examples/opencpop/voc1/local/PTQ_static.sh b/examples/opencpop/voc1/local/PTQ_static.sh
new file mode 120000
index 00000000..247ce5c7
--- /dev/null
+++ b/examples/opencpop/voc1/local/PTQ_static.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/PTQ_static.sh
\ No newline at end of file
diff --git a/examples/opencpop/voc1/local/dygraph_to_static.sh b/examples/opencpop/voc1/local/dygraph_to_static.sh
new file mode 100755
index 00000000..40a2c51b
--- /dev/null
+++ b/examples/opencpop/voc1/local/dygraph_to_static.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../../dygraph_to_static.py \
+ --type=voc \
+ --voc=pwgan_opencpop \
+ --voc_config=${config_path} \
+ --voc_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --voc_stat=dump/train/feats_stats.npy \
+ --inference_dir=exp/default/inference/
diff --git a/examples/opencpop/voc1/local/preprocess.sh b/examples/opencpop/voc1/local/preprocess.sh
new file mode 100755
index 00000000..edab4d0d
--- /dev/null
+++ b/examples/opencpop/voc1/local/preprocess.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+stage=0
+stop_stage=100
+
+config_path=$1
+
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # extract features
+ echo "Extract features ..."
+ python3 ${BIN_DIR}/../preprocess.py \
+ --rootdir=~/datasets/Opencpop/segments/ \
+ --dataset=opencpop \
+ --dumpdir=dump \
+ --dur-file=~/datasets/Opencpop/segments/transcriptions.txt \
+ --config=${config_path} \
+ --cut-sil=False \
+ --num-cpu=20
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # get features' stats (mean and std)
+ echo "Get features' stats ..."
+ python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --field-name="feats"
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # normalize, dev and test should use train's stats
+ echo "Normalize ..."
+
+ python3 ${BIN_DIR}/../normalize.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --dumpdir=dump/train/norm \
+ --stats=dump/train/feats_stats.npy
+ python3 ${BIN_DIR}/../normalize.py \
+ --metadata=dump/dev/raw/metadata.jsonl \
+ --dumpdir=dump/dev/norm \
+ --stats=dump/train/feats_stats.npy
+
+ python3 ${BIN_DIR}/../normalize.py \
+ --metadata=dump/test/raw/metadata.jsonl \
+ --dumpdir=dump/test/norm \
+ --stats=dump/train/feats_stats.npy
+fi
diff --git a/examples/opencpop/voc1/local/synthesize.sh b/examples/opencpop/voc1/local/synthesize.sh
new file mode 120000
index 00000000..d6aecd8d
--- /dev/null
+++ b/examples/opencpop/voc1/local/synthesize.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/opencpop/voc1/local/train.sh b/examples/opencpop/voc1/local/train.sh
new file mode 120000
index 00000000..2942893d
--- /dev/null
+++ b/examples/opencpop/voc1/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/opencpop/voc1/path.sh b/examples/opencpop/voc1/path.sh
new file mode 120000
index 00000000..b7ed4fb8
--- /dev/null
+++ b/examples/opencpop/voc1/path.sh
@@ -0,0 +1 @@
+../../csmsc/voc1/path.sh
\ No newline at end of file
diff --git a/examples/opencpop/voc1/run.sh b/examples/opencpop/voc1/run.sh
new file mode 100755
index 00000000..1f87425f
--- /dev/null
+++ b/examples/opencpop/voc1/run.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_100000.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # synthesize
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+# dygraph to static
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/dygraph_to_static.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+# PTQ_static
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} pwgan_opencpop || exit -1
+fi
diff --git a/examples/opencpop/voc5/conf/default.yaml b/examples/opencpop/voc5/conf/default.yaml
new file mode 100644
index 00000000..10449f86
--- /dev/null
+++ b/examples/opencpop/voc5/conf/default.yaml
@@ -0,0 +1,167 @@
+# This is the configuration file for the Opencpop dataset.
+# This configuration is based on HiFiGAN V1, which is an official configuration.
+# But I found that the optimizer setting does not work well with my implementation.
+# So I changed optimizer settings as follows:
+# - AdamW -> Adam
+# - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
+# - Scheduler: ExponentialLR -> MultiStepLR
+# To match the shift size difference, the upsample scales are also modified from the original 256-shift setting.
+
+###########################################################
+# FEATURE EXTRACTION SETTING #
+###########################################################
+fs: 24000 # Sampling rate.
+n_fft: 512 # FFT size (samples).
+n_shift: 128          # Hop size (samples). ~5.3 ms at fs=24000.
+win_length: 512       # Window length (samples). ~21.3 ms at fs=24000.
+ # If set to null, it will be the same as fft_size.
+window: "hann" # Window function.
+n_mels: 80 # Number of mel basis.
+fmin: 80 # Minimum freq in mel basis calculation. (Hz)
+fmax: 12000 # Maximum frequency in mel basis calculation. (Hz)
+
+###########################################################
+# GENERATOR NETWORK ARCHITECTURE SETTING #
+###########################################################
+generator_params:
+ in_channels: 80 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+ channels: 512 # Number of initial channels.
+ kernel_size: 7 # Kernel size of initial and final conv layers.
+ upsample_scales: [8, 4, 2, 2] # Upsampling scales.
+ upsample_kernel_sizes: [16, 8, 4, 4] # Kernel size for upsampling layers.
+ resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
+ resblock_dilations: # Dilations for residual blocks.
+ - [1, 3, 5]
+ - [1, 3, 5]
+ - [1, 3, 5]
+ use_additional_convs: True # Whether to use additional conv layer in residual blocks.
+ bias: True # Whether to use bias parameter in conv.
+ nonlinear_activation: "leakyrelu" # Nonlinear activation type.
+    nonlinear_activation_params:         # Nonlinear activation parameters.
+ negative_slope: 0.1
+ use_weight_norm: True # Whether to apply weight normalization.
+
+
+###########################################################
+# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
+###########################################################
+discriminator_params:
+ scales: 3 # Number of multi-scale discriminator.
+ scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator.
+ scale_downsample_pooling_params:
+ kernel_size: 4 # Pooling kernel size.
+ stride: 2 # Pooling stride.
+ padding: 2 # Padding size.
+ scale_discriminator_params:
+ in_channels: 1 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+ kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
+ channels: 128 # Initial number of channels.
+ max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
+ max_groups: 16 # Maximum number of groups in downsampling conv layers.
+ bias: True
+ downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
+ nonlinear_activation: "leakyrelu" # Nonlinear activation.
+ nonlinear_activation_params:
+ negative_slope: 0.1
+ follow_official_norm: True # Whether to follow the official norm setting.
+ periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
+ period_discriminator_params:
+ in_channels: 1 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+ kernel_sizes: [5, 3] # List of kernel sizes.
+ channels: 32 # Initial number of channels.
+ downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
+ max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
+        bias: True                           # Whether to use bias parameter in conv layer.
+ nonlinear_activation: "leakyrelu" # Nonlinear activation.
+        nonlinear_activation_params:         # Nonlinear activation parameters.
+ negative_slope: 0.1
+ use_weight_norm: True # Whether to apply weight normalization.
+ use_spectral_norm: False # Whether to apply spectral normalization.
+
+
+###########################################################
+# STFT LOSS SETTING #
+###########################################################
+use_stft_loss: False # Whether to use multi-resolution STFT loss.
+use_mel_loss: True # Whether to use Mel-spectrogram loss.
+mel_loss_params:
+ fs: 24000
+ fft_size: 512
+ hop_size: 128
+ win_length: 512
+ window: "hann"
+ num_mels: 80
+ fmin: 30
+ fmax: 12000
+ log_base: null
+generator_adv_loss_params:
+ average_by_discriminators: False # Whether to average loss by #discriminators.
+discriminator_adv_loss_params:
+ average_by_discriminators: False # Whether to average loss by #discriminators.
+use_feat_match_loss: True
+feat_match_loss_params:
+ average_by_discriminators: False # Whether to average loss by #discriminators.
+ average_by_layers: False # Whether to average loss by #layers in each discriminator.
+ include_final_outputs: False # Whether to include final outputs in feat match loss calculation.
+
+###########################################################
+# ADVERSARIAL LOSS SETTING #
+###########################################################
+lambda_aux: 45.0 # Loss balancing coefficient for STFT loss.
+lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss.
+lambda_feat_match: 2.0  # Loss balancing coefficient for feat match loss.
+
+###########################################################
+# DATA LOADER SETTING #
+###########################################################
+batch_size: 16 # Batch size.
+batch_max_steps: 8400       # Length of each audio clip in the batch. Make sure it is divisible by hop_size.
+num_workers: 1 # Number of workers in DataLoader.
+
+###########################################################
+# OPTIMIZER & SCHEDULER SETTING #
+###########################################################
+generator_optimizer_params:
+ beta1: 0.5
+ beta2: 0.9
+ weight_decay: 0.0 # Generator's weight decay coefficient.
+generator_scheduler_params:
+ learning_rate: 2.0e-4 # Generator's learning rate.
+ gamma: 0.5 # Generator's scheduler gamma.
+ milestones: # At each milestone, lr will be multiplied by gamma.
+ - 200000
+ - 400000
+ - 600000
+ - 800000
+generator_grad_norm: -1 # Generator's gradient norm.
+discriminator_optimizer_params:
+ beta1: 0.5
+ beta2: 0.9
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
+discriminator_scheduler_params:
+ learning_rate: 2.0e-4 # Discriminator's learning rate.
+ gamma: 0.5 # Discriminator's scheduler gamma.
+ milestones: # At each milestone, lr will be multiplied by gamma.
+ - 200000
+ - 400000
+ - 600000
+ - 800000
+discriminator_grad_norm: -1 # Discriminator's gradient norm.
+
+###########################################################
+# INTERVAL SETTING #
+###########################################################
+generator_train_start_steps: 1          # Number of steps to start to train generator.
+discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
+train_max_steps: 2500000 # Number of training steps.
+save_interval_steps: 5000 # Interval steps to save checkpoint.
+eval_interval_steps: 1000 # Interval steps to evaluate the network.
+
+###########################################################
+# OTHER SETTING #
+###########################################################
+num_snapshots: 4 # max number of snapshots to keep while training
+seed: 42 # random seed for paddle, random, and np.random
diff --git a/examples/opencpop/voc5/conf/finetune.yaml b/examples/opencpop/voc5/conf/finetune.yaml
new file mode 100644
index 00000000..0022a67a
--- /dev/null
+++ b/examples/opencpop/voc5/conf/finetune.yaml
@@ -0,0 +1,168 @@
+# This is the configuration file for the Opencpop dataset.
+# This configuration is based on HiFiGAN V1, which is an official configuration.
+# But I found that the optimizer setting does not work well with my implementation.
+# So I changed optimizer settings as follows:
+# - AdamW -> Adam
+# - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
+# - Scheduler: ExponentialLR -> MultiStepLR
+# To match the shift size difference, the upsample scales are also modified from the original 256-shift setting.
+
+###########################################################
+# FEATURE EXTRACTION SETTING #
+###########################################################
+fs: 24000 # Sampling rate.
+n_fft: 512 # FFT size (samples).
+n_shift: 128          # Hop size (samples). ~5.3 ms at fs=24000.
+win_length: 512       # Window length (samples). ~21.3 ms at fs=24000.
+ # If set to null, it will be the same as fft_size.
+window: "hann" # Window function.
+n_mels: 80 # Number of mel basis.
+fmin: 80 # Minimum freq in mel basis calculation. (Hz)
+fmax: 12000 # Maximum frequency in mel basis calculation. (Hz)
+
+###########################################################
+# GENERATOR NETWORK ARCHITECTURE SETTING #
+###########################################################
+generator_params:
+ in_channels: 80 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+ channels: 512 # Number of initial channels.
+ kernel_size: 7 # Kernel size of initial and final conv layers.
+ upsample_scales: [8, 4, 2, 2] # Upsampling scales.
+ upsample_kernel_sizes: [16, 8, 4, 4] # Kernel size for upsampling layers.
+ resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
+ resblock_dilations: # Dilations for residual blocks.
+ - [1, 3, 5]
+ - [1, 3, 5]
+ - [1, 3, 5]
+ use_additional_convs: True # Whether to use additional conv layer in residual blocks.
+ bias: True # Whether to use bias parameter in conv.
+ nonlinear_activation: "leakyrelu" # Nonlinear activation type.
+    nonlinear_activation_params:         # Nonlinear activation parameters.
+ negative_slope: 0.1
+ use_weight_norm: True # Whether to apply weight normalization.
+
+
+###########################################################
+# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
+###########################################################
+discriminator_params:
+ scales: 3 # Number of multi-scale discriminator.
+ scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator.
+ scale_downsample_pooling_params:
+ kernel_size: 4 # Pooling kernel size.
+ stride: 2 # Pooling stride.
+ padding: 2 # Padding size.
+ scale_discriminator_params:
+ in_channels: 1 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+ kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
+ channels: 128 # Initial number of channels.
+ max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
+ max_groups: 16 # Maximum number of groups in downsampling conv layers.
+ bias: True
+ downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
+ nonlinear_activation: "leakyrelu" # Nonlinear activation.
+ nonlinear_activation_params:
+ negative_slope: 0.1
+ follow_official_norm: True # Whether to follow the official norm setting.
+ periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
+ period_discriminator_params:
+ in_channels: 1 # Number of input channels.
+ out_channels: 1 # Number of output channels.
+ kernel_sizes: [5, 3] # List of kernel sizes.
+ channels: 32 # Initial number of channels.
+ downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
+ max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
+        bias: True                           # Whether to use bias parameter in conv layer.
+ nonlinear_activation: "leakyrelu" # Nonlinear activation.
+        nonlinear_activation_params:         # Nonlinear activation parameters.
+ negative_slope: 0.1
+ use_weight_norm: True # Whether to apply weight normalization.
+ use_spectral_norm: False # Whether to apply spectral normalization.
+
+
+###########################################################
+# STFT LOSS SETTING #
+###########################################################
+use_stft_loss: False # Whether to use multi-resolution STFT loss.
+use_mel_loss: True # Whether to use Mel-spectrogram loss.
+mel_loss_params:
+ fs: 24000
+ fft_size: 512
+ hop_size: 128
+ win_length: 512
+ window: "hann"
+ num_mels: 80
+ fmin: 30
+ fmax: 12000
+ log_base: null
+generator_adv_loss_params:
+ average_by_discriminators: False # Whether to average loss by #discriminators.
+discriminator_adv_loss_params:
+ average_by_discriminators: False # Whether to average loss by #discriminators.
+use_feat_match_loss: True
+feat_match_loss_params:
+ average_by_discriminators: False # Whether to average loss by #discriminators.
+ average_by_layers: False # Whether to average loss by #layers in each discriminator.
+ include_final_outputs: False # Whether to include final outputs in feat match loss calculation.
+
+###########################################################
+# ADVERSARIAL LOSS SETTING #
+###########################################################
+lambda_aux: 45.0 # Loss balancing coefficient for STFT loss.
+lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss.
+lambda_feat_match: 2.0  # Loss balancing coefficient for feat match loss.
+
+###########################################################
+# DATA LOADER SETTING #
+###########################################################
+#batch_size: 16 # Batch size.
+batch_size: 1 # Batch size.
+batch_max_steps: 8400       # Length of each audio clip in the batch. Make sure it is divisible by hop_size.
+num_workers: 1 # Number of workers in DataLoader.
+
+###########################################################
+# OPTIMIZER & SCHEDULER SETTING #
+###########################################################
+generator_optimizer_params:
+ beta1: 0.5
+ beta2: 0.9
+ weight_decay: 0.0 # Generator's weight decay coefficient.
+generator_scheduler_params:
+ learning_rate: 2.0e-4 # Generator's learning rate.
+ gamma: 0.5 # Generator's scheduler gamma.
+ milestones: # At each milestone, lr will be multiplied by gamma.
+ - 200000
+ - 400000
+ - 600000
+ - 800000
+generator_grad_norm: -1 # Generator's gradient norm.
+discriminator_optimizer_params:
+ beta1: 0.5
+ beta2: 0.9
+ weight_decay: 0.0 # Discriminator's weight decay coefficient.
+discriminator_scheduler_params:
+ learning_rate: 2.0e-4 # Discriminator's learning rate.
+ gamma: 0.5 # Discriminator's scheduler gamma.
+ milestones: # At each milestone, lr will be multiplied by gamma.
+ - 200000
+ - 400000
+ - 600000
+ - 800000
+discriminator_grad_norm: -1 # Discriminator's gradient norm.
+
+###########################################################
+# INTERVAL SETTING #
+###########################################################
+generator_train_start_steps: 1          # Number of steps to start to train generator.
+discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
+train_max_steps: 2600000 # Number of training steps.
+save_interval_steps: 5000 # Interval steps to save checkpoint.
+eval_interval_steps: 1000 # Interval steps to evaluate the network.
+
+###########################################################
+# OTHER SETTING #
+###########################################################
+num_snapshots: 4 # max number of snapshots to keep while training
+seed: 42 # random seed for paddle, random, and np.random
diff --git a/examples/opencpop/voc5/finetune.sh b/examples/opencpop/voc5/finetune.sh
new file mode 100755
index 00000000..76f36329
--- /dev/null
+++ b/examples/opencpop/voc5/finetune.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
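+# Fine-tune the HiFiGAN vocoder on mel-spectrograms generated by a trained
+# DiffSinger acoustic model (gen_gta_mel.py produces ground-truth-aligned mels),
+# so that the vocoder better matches the acoustic model's outputs.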
+
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/diffsinger/gen_gta_mel.py \
+ --diffsinger-config=diffsinger_opencpop_ckpt_1.4.0/default.yaml \
+ --diffsinger-checkpoint=diffsinger_opencpop_ckpt_1.4.0/snapshot_iter_160000.pdz \
+ --diffsinger-stat=diffsinger_opencpop_ckpt_1.4.0/speech_stats.npy \
+ --diffsinger-stretch=diffsinger_opencpop_ckpt_1.4.0/speech_stretchs.npy \
+ --dur-file=~/datasets/Opencpop/segments/transcriptions.txt \
+ --output-dir=dump_finetune \
+ --phones-dict=diffsinger_opencpop_ckpt_1.4.0/phone_id_map.txt \
+ --dataset=opencpop \
+ --rootdir=~/datasets/Opencpop/segments/
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ python3 ${MAIN_ROOT}/utils/link_wav.py \
+ --old-dump-dir=dump \
+ --dump-dir=dump_finetune
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # get features' stats (mean and std)
+ echo "Get features' stats ..."
+ cp dump/train/feats_stats.npy dump_finetune/train/
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # normalize, dev and test should use train's stats
+ echo "Normalize ..."
+
+ python3 ${BIN_DIR}/../normalize.py \
+ --metadata=dump_finetune/train/raw/metadata.jsonl \
+ --dumpdir=dump_finetune/train/norm \
+ --stats=dump_finetune/train/feats_stats.npy
+ python3 ${BIN_DIR}/../normalize.py \
+ --metadata=dump_finetune/dev/raw/metadata.jsonl \
+ --dumpdir=dump_finetune/dev/norm \
+ --stats=dump_finetune/train/feats_stats.npy
+
+ python3 ${BIN_DIR}/../normalize.py \
+ --metadata=dump_finetune/test/raw/metadata.jsonl \
+ --dumpdir=dump_finetune/test/norm \
+ --stats=dump_finetune/train/feats_stats.npy
+fi
+
+# create finetune env
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ echo "create finetune env"
+ python3 local/prepare_env.py \
+ --pretrained_model_dir=exp/default/checkpoints/ \
+ --output_dir=exp/finetune/
+fi
+
+# finetune
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+ CUDA_VISIBLE_DEVICES=${gpus} \
+ FLAGS_cudnn_exhaustive_search=true \
+ FLAGS_conv_workspace_size_limit=4000 \
+ python ${BIN_DIR}/train.py \
+ --train-metadata=dump_finetune/train/norm/metadata.jsonl \
+ --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \
+ --config=conf/finetune.yaml \
+ --output-dir=exp/finetune \
+ --ngpu=1
+fi
diff --git a/examples/opencpop/voc5/local/PTQ_static.sh b/examples/opencpop/voc5/local/PTQ_static.sh
new file mode 120000
index 00000000..247ce5c7
--- /dev/null
+++ b/examples/opencpop/voc5/local/PTQ_static.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/PTQ_static.sh
\ No newline at end of file
diff --git a/examples/opencpop/voc5/local/dygraph_to_static.sh b/examples/opencpop/voc5/local/dygraph_to_static.sh
new file mode 100755
index 00000000..65077661
--- /dev/null
+++ b/examples/opencpop/voc5/local/dygraph_to_static.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../../dygraph_to_static.py \
+ --type=voc \
+ --voc=hifigan_opencpop \
+ --voc_config=${config_path} \
+ --voc_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --voc_stat=dump/train/feats_stats.npy \
+ --inference_dir=exp/default/inference/
diff --git a/examples/opencpop/voc5/local/prepare_env.py b/examples/opencpop/voc5/local/prepare_env.py
new file mode 120000
index 00000000..be03c86b
--- /dev/null
+++ b/examples/opencpop/voc5/local/prepare_env.py
@@ -0,0 +1 @@
+../../../other/tts_finetune/tts3/local/prepare_env.py
\ No newline at end of file
diff --git a/examples/opencpop/voc5/local/preprocess.sh b/examples/opencpop/voc5/local/preprocess.sh
new file mode 120000
index 00000000..f0cb24de
--- /dev/null
+++ b/examples/opencpop/voc5/local/preprocess.sh
@@ -0,0 +1 @@
+../../voc1/local/preprocess.sh
\ No newline at end of file
diff --git a/examples/opencpop/voc5/local/synthesize.sh b/examples/opencpop/voc5/local/synthesize.sh
new file mode 120000
index 00000000..c887112c
--- /dev/null
+++ b/examples/opencpop/voc5/local/synthesize.sh
@@ -0,0 +1 @@
+../../../csmsc/voc5/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/opencpop/voc5/local/train.sh b/examples/opencpop/voc5/local/train.sh
new file mode 120000
index 00000000..2942893d
--- /dev/null
+++ b/examples/opencpop/voc5/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/opencpop/voc5/path.sh b/examples/opencpop/voc5/path.sh
new file mode 120000
index 00000000..b67fe2b3
--- /dev/null
+++ b/examples/opencpop/voc5/path.sh
@@ -0,0 +1 @@
+../../csmsc/voc5/path.sh
\ No newline at end of file
diff --git a/examples/opencpop/voc5/run.sh b/examples/opencpop/voc5/run.sh
new file mode 100755
index 00000000..290c90d2
--- /dev/null
+++ b/examples/opencpop/voc5/run.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+ckpt_name=snapshot_iter_2500000.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # synthesize
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+# dygraph to static
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/dygraph_to_static.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+# PTQ_static
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/PTQ_static.sh ${train_output_path} hifigan_opencpop || exit -1
+fi
diff --git a/examples/other/mfa/local/detect_oov.py b/examples/other/mfa/local/detect_oov.py
old mode 100644
new mode 100755
index 4928e453..cd259cce
--- a/examples/other/mfa/local/detect_oov.py
+++ b/examples/other/mfa/local/detect_oov.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/examples/other/mfa/local/generate_canton_lexicon_wavlabs.py b/examples/other/mfa/local/generate_canton_lexicon_wavlabs.py
old mode 100644
new mode 100755
diff --git a/examples/other/mfa/local/generate_lexicon.py b/examples/other/mfa/local/generate_lexicon.py
old mode 100644
new mode 100755
index 3deb2470..089ce852
--- a/examples/other/mfa/local/generate_lexicon.py
+++ b/examples/other/mfa/local/generate_lexicon.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -48,7 +49,7 @@ def rule(C, V, R, T):
'i' is distinguished when appeared in phonemes, and separated into 3 categories, 'i', 'ii' and 'iii'.
- Erhua is is possibly applied to every finals, except for finals that already ends with 'r'.
+ Erhua is possibly applied to every finals, except for finals that already ends with 'r'.
When a syllable is impossible or does not have any characters with this pronunciation, return None
to filter it out.
diff --git a/examples/other/mfa/local/reorganize_aishell3.py b/examples/other/mfa/local/reorganize_aishell3.py
old mode 100644
new mode 100755
diff --git a/examples/other/mfa/local/reorganize_baker.py b/examples/other/mfa/local/reorganize_baker.py
old mode 100644
new mode 100755
index 0e0035bd..dde40210
--- a/examples/other/mfa/local/reorganize_baker.py
+++ b/examples/other/mfa/local/reorganize_baker.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/examples/other/mfa/local/reorganize_ljspeech.py b/examples/other/mfa/local/reorganize_ljspeech.py
old mode 100644
new mode 100755
diff --git a/examples/other/mfa/local/reorganize_vctk.py b/examples/other/mfa/local/reorganize_vctk.py
old mode 100644
new mode 100755
diff --git a/examples/other/mfa/run.sh b/examples/other/mfa/run.sh
index 40241683..ca4777e9 100755
--- a/examples/other/mfa/run.sh
+++ b/examples/other/mfa/run.sh
@@ -1,29 +1,32 @@
-EXP_DIR=exp
+exp=exp
+data=data
+
+mkdir -p $exp
+mkdir -p $data
-mkdir -p $EXP_DIR
LEXICON_NAME='simple'
-if [ ! -f "$EXP_DIR/$LEXICON_NAME.lexicon" ]; then
+MFA_DOWNLOAD_DIR=local/
+
+if [ ! -f "$exp/$LEXICON_NAME.lexicon" ]; then
echo "generating lexicon..."
- python local/generate_lexicon.py "$EXP_DIR/$LEXICON_NAME" --with-r --with-tone
+ python local/generate_lexicon.py "$exp/$LEXICON_NAME" --with-r --with-tone
echo "lexicon done"
fi
-if [ ! -d $EXP_DIR/baker_corpus ]; then
+if [ ! -d $exp/baker_corpus ]; then
echo "reorganizing baker corpus..."
- python local/reorganize_baker.py --root-dir=~/datasets/BZNSYP --output-dir=$EXP_DIR/baker_corpus --resample-audio
- echo "reorganization done. Check output in $EXP_DIR/baker_corpus."
+ python local/reorganize_baker.py --root-dir=~/datasets/BZNSYP --output-dir=$exp/baker_corpus --resample-audio
+ echo "reorganization done. Check output in $exp/baker_corpus."
echo "audio files are resampled to 16kHz"
- echo "transcription for each audio file is saved with the same namd in $EXP_DIR/baker_corpus "
+ echo "transcription for each audio file is saved with the same namd in $exp/baker_corpus "
fi
echo "detecting oov..."
-python local/detect_oov.py $EXP_DIR/baker_corpus $EXP_DIR/"$LEXICON_NAME.lexicon"
+python local/detect_oov.py $exp/baker_corpus $exp/"$LEXICON_NAME.lexicon"
echo "detecting oov done. you may consider regenerate lexicon if there is unexpected OOVs."
-MFA_DOWNLOAD_DIR=local/
-
if [ ! -f "$MFA_DOWNLOAD_DIR/montreal-forced-aligner_linux.tar.gz" ]; then
echo "downloading mfa..."
(cd $MFA_DOWNLOAD_DIR && wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz)
@@ -37,11 +40,15 @@ if [ ! -d "$MFA_DOWNLOAD_DIR/montreal-forced-aligner" ]; then
fi
export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin"
-if [ ! -d "$EXP_DIR/baker_alignment" ]; then
+
+if [ ! -d "$exp/baker_alignment" ]; then
echo "Start MFA training..."
- mfa_train_and_align $EXP_DIR/baker_corpus "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/baker_alignment -o $EXP_DIR/baker_model --clean --verbose --temp_directory $EXP_DIR/.mfa_train_and_align
+ PATH=$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin/:$PATH \
+ LD_LIBRARY_PATH=$MFA_DOWNLOAD_DIR/montreal-forced-aligner/lib/:$LD_LIBRARY_PATH \
+ ./$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin/mfa_train_and_align \
+ $exp/baker_corpus "$exp/$LEXICON_NAME.lexicon" $exp/baker_alignment -o $exp/baker_model --clean --verbose -j 10 --temp_directory $exp/.mfa_train_and_align
echo "training done!"
- echo "results: $EXP_DIR/baker_alignment"
- echo "model: $EXP_DIR/baker_model"
+ echo "results: $exp/baker_alignment"
+ echo "model: $exp/baker_model"
fi
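The rewritten run.sh calls the bundled `mfa_train_and_align` binary directly, prepending the extracted tarball's `bin/` and `lib/` directories to `PATH` and `LD_LIBRARY_PATH` and running 10 parallel jobs. A minimal Python sketch of the same invocation via `subprocess`; the paths mirror the script above and are illustrative only:

```python
import os
import subprocess

# Paths mirror run.sh above; adjust to your checkout (illustrative only).
mfa_dir = "local/montreal-forced-aligner"
exp = "exp"
lexicon = f"{exp}/simple.lexicon"

env = dict(os.environ)
# Prepend the bundled MFA binaries and shared libraries, as run.sh does.
env["PATH"] = f"{mfa_dir}/bin:" + env.get("PATH", "")
env["LD_LIBRARY_PATH"] = f"{mfa_dir}/lib:" + env.get("LD_LIBRARY_PATH", "")

subprocess.run(
    [
        f"{mfa_dir}/bin/mfa_train_and_align",
        f"{exp}/baker_corpus", lexicon, f"{exp}/baker_alignment",
        "-o", f"{exp}/baker_model",
        "--clean", "--verbose", "-j", "10",
        "--temp_directory", f"{exp}/.mfa_train_and_align",
    ],
    env=env,
    check=True,  # raise if MFA exits non-zero, like `|| exit` in shell
)
```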
diff --git a/examples/other/mfa/run_canton.sh b/examples/other/mfa/run_canton.sh
index cef6a2f0..851e42d0 100755
--- a/examples/other/mfa/run_canton.sh
+++ b/examples/other/mfa/run_canton.sh
@@ -1,16 +1,15 @@
-EXP_DIR=exp
+exp=exp
-mkdir -p $EXP_DIR
+mkdir -p $exp
LEXICON_NAME='canton'
-if [ ! -f "$EXP_DIR/$LEXICON_NAME.lexicon" ]; then
+MFA_DOWNLOAD_DIR=local/
+
+if [ ! -f "$exp/$LEXICON_NAME.lexicon" ]; then
echo "generating lexicon and training data..."
- python local/generate_canton_lexicon_wavlabs.py --output_lexicon "$EXP_DIR/$LEXICON_NAME.lexicon" --output_wavlabs "$EXP_DIR/$LEXICON_NAME"_wavlabs --inputs ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle
+ python local/generate_canton_lexicon_wavlabs.py --output_lexicon "$exp/$LEXICON_NAME.lexicon" --output_wavlabs "$exp/$LEXICON_NAME"_wavlabs --inputs ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle
echo "lexicon and training data done"
fi
-
-MFA_DOWNLOAD_DIR=local/
-
if [ ! -f "$MFA_DOWNLOAD_DIR/montreal-forced-aligner_linux.tar.gz" ]; then
echo "downloading mfa..."
(cd $MFA_DOWNLOAD_DIR && wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz)
@@ -24,11 +23,14 @@ if [ ! -d "$MFA_DOWNLOAD_DIR/montreal-forced-aligner" ]; then
fi
export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin"
-if [ ! -d "$EXP_DIR/canton_alignment" ]; then
+if [ ! -d "$exp/canton_alignment" ]; then
echo "Start MFA training..."
- mfa_train_and_align "$EXP_DIR/$LEXICON_NAME"_wavlabs "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/canton_alignment -o $EXP_DIR/canton_model --clean --verbose --temp_directory $EXP_DIR/.mfa_train_and_align
+ PATH=$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin/:$PATH \
+ LD_LIBRARY_PATH=$MFA_DOWNLOAD_DIR/montreal-forced-aligner/lib/:$LD_LIBRARY_PATH \
+ ./$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin/mfa_train_and_align \
+ "$exp/$LEXICON_NAME"_wavlabs "$exp/$LEXICON_NAME.lexicon" $exp/canton_alignment -o $exp/canton_model --clean --verbose -j 10 --temp_directory $exp/.mfa_train_and_align
echo "training done!"
- echo "results: $EXP_DIR/canton_alignment"
- echo "model: $EXP_DIR/canton_model"
+ echo "results: $exp/canton_alignment"
+ echo "model: $exp/canton_model"
fi
diff --git a/examples/other/rhy/local/pre_for_sp_aishell.py b/examples/other/rhy/local/pre_for_sp_aishell.py
old mode 100644
new mode 100755
index a2a71668..ff0830a5
--- a/examples/other/rhy/local/pre_for_sp_aishell.py
+++ b/examples/other/rhy/local/pre_for_sp_aishell.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
import argparse
import os
import re
@@ -8,7 +9,7 @@ replace_ = {"#1": "%", "#2": "`", "#3": "~", "#4": "$"}
def replace_rhy_with_punc(line):
- # r'[:、,;。?!,.:;"?!”’《》【】<=>{}()()#&@“”^_|…\\]%*$', '', line) #参考checkcheck_oov.py,
+ # r'[:、,;。?!,.:;"?!”’《》【】<=>{}()()#&@“”^_|…\\]%*$', '', line) # refer to check_oov.py
line = re.sub(r'[:、,;。?!,.:;"?!’《》【】<=>{}()()#&@“”^_|…\\]%*$', '', line)
for r in replace_.keys():
if r in line:
diff --git a/examples/other/rhy/local/pre_for_sp_csmsc.py b/examples/other/rhy/local/pre_for_sp_csmsc.py
old mode 100644
new mode 100755
index 0a96092c..8b4f9e1f
--- a/examples/other/rhy/local/pre_for_sp_csmsc.py
+++ b/examples/other/rhy/local/pre_for_sp_csmsc.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python3
import argparse
import os
import re
@@ -6,7 +7,7 @@ replace_ = {"#1": "%", "#2": "`", "#3": "~", "#4": "$"}
def replace_rhy_with_punc(line):
- # r'[:、,;。?!,.:;"?!”’《》【】<=>{}()()#&@“”^_|…\\]%*$', '', line) #参考checkcheck_oov.py,
+ # r'[:、,;。?!,.:;"?!”’《》【】<=>{}()()#&@“”^_|…\\]%*$', '', line) # refer to check_oov.py
line = re.sub(r'^$\*%', '', line)
for r in replace_.keys():
if r in line:
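Both `pre_for_sp_*.py` scripts strip punctuation and then map the prosody break markers `#1`–`#4` to single placeholder characters via the `replace_` table shown above. A small self-contained sketch of that substitution; the punctuation pattern is simplified here for illustration:

```python
import re

# Same mapping as in pre_for_sp_aishell.py / pre_for_sp_csmsc.py:
# prosody markers #1..#4 become single placeholder characters.
replace_ = {"#1": "%", "#2": "`", "#3": "~", "#4": "$"}


def replace_rhy_with_punc(line: str) -> str:
    # strip punctuation first (simplified character class, for illustration)
    line = re.sub(r'[:、,;。?!,.:;"?!“”‘’]', '', line)
    for marker, placeholder in replace_.items():
        line = line.replace(marker, placeholder)
    return line


print(replace_rhy_with_punc("今天#1天气#2不错#4。"))  # -> 今天%天气`不错$
```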
diff --git a/examples/other/rhy/run.sh b/examples/other/rhy/run.sh
index aed58152..3b65e583 100755
--- a/examples/other/rhy/run.sh
+++ b/examples/other/rhy/run.sh
@@ -6,13 +6,15 @@ gpus=0
stage=0
stop_stage=100
+data=data
+mkdir -p $data
+
aishell_data=label_train-set.txt
csmsc_data=000001-010000.txt
-processed_path=data
conf_path=conf/default.yaml
train_output_path=exp/default
-ckpt_name=snapshot_iter_2600.pdz
+ckpt_name=snapshot_iter_4680.pdz
text=我们城市的复苏有赖于他强有力的政策。
print_eval=false
@@ -23,7 +25,7 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
- ./local/data.sh ${aishell_data} ${csmsc_data} ${processed_path}
+ ./local/data.sh ${aishell_data} ${csmsc_data} ${data}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
diff --git a/examples/other/spm/README.md b/examples/other/spm/README.md
index fc4478eb..5eee8bab 100644
--- a/examples/other/spm/README.md
+++ b/examples/other/spm/README.md
@@ -33,6 +33,18 @@ ec5a9b24acc35469229e41256ceaf77d data/lang_char/input.txt
```
```
+==> data/lang_char/input.txt <==
+mister quilter is the apostle of the middle classes and we are glad to welcome his gospel
+nor is mister quilter's manner less interesting than his matter
+he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind
+he has grave doubts whether sir frederick leighton's work is really greek after all and can discover in it but little of rocky ithaca
+linnell's pictures are a sort of up guards and at em paintings and mason's exquisite idylls are as national as a jingo poem mister birket foster's landscapes smile at one much in the same way that mister carker used to flash his teeth and mister john collier gives his sitter a cheerful slap on the back before he says like a shampooer in a turkish bath next man
+it is obviously unnecessary for us to point out how luminous these criticisms are how delicate in expression
+on the general principles of art mister quilter writes with equal lucidity
+painting he tells us is of a different quality to mathematics and finish in art is adding more fact
+as for etchings they are of two kinds british and foreign
+he laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes the customary appeal to the last judgment and reminds us that in the great days of art michael angelo was the furnishing upholsterer
+
==> data/lang_char/input.bpe <==
▁mi ster ▁quilter ▁ is ▁the ▁a p ost le ▁o f ▁the ▁mi d d le ▁c las s es ▁ and ▁we ▁ar e ▁g l a d ▁ to ▁we l c om e ▁h is ▁g o s pe l
▁ n or ▁ is ▁mi ster ▁quilter ' s ▁ma nne r ▁ l ess ▁in ter es t ing ▁tha n ▁h is ▁ma t ter
@@ -58,17 +70,6 @@ painting he tells us is of a different quality to mathematics and finish in art
as for etchings they are of two kinds british and foreign
he laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes the customary appeal to the last judgment and reminds us that in the great days of art michael angelo was the furnishing upholsterer
-==> data/lang_char/input.txt <==
-mister quilter is the apostle of the middle classes and we are glad to welcome his gospel
-nor is mister quilter's manner less interesting than his matter
-he tells us that at this festive season of the year with christmas and roast beef looming before us similes drawn from eating and its results occur most readily to the mind
-he has grave doubts whether sir frederick leighton's work is really greek after all and can discover in it but little of rocky ithaca
-linnell's pictures are a sort of up guards and at em paintings and mason's exquisite idylls are as national as a jingo poem mister birket foster's landscapes smile at one much in the same way that mister carker used to flash his teeth and mister john collier gives his sitter a cheerful slap on the back before he says like a shampooer in a turkish bath next man
-it is obviously unnecessary for us to point out how luminous these criticisms are how delicate in expression
-on the general principles of art mister quilter writes with equal lucidity
-painting he tells us is of a different quality to mathematics and finish in art is adding more fact
-as for etchings they are of two kinds british and foreign
-he laments most bitterly the divorce that has been made between decorative art and what we usually call pictures makes the customary appeal to the last judgment and reminds us that in the great days of art michael angelo was the furnishing upholsterer
==> data/lang_char/train_unigram100_units.txt <==
0
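The listing above pairs raw LibriSpeech text (`input.txt`) with its subword segmentation (`input.bpe`) produced by a 100-unit unigram SentencePiece model. A rough sketch of reproducing such a segmentation with the `sentencepiece` Python package; the file names and options below are illustrative, not the exact ones this recipe's scripts use:

```python
import sentencepiece as spm

# Train a tiny unigram model on the raw text (illustrative settings only).
spm.SentencePieceTrainer.train(
    input="data/lang_char/input.txt",
    model_prefix="train_unigram100",
    model_type="unigram",
    vocab_size=100,
    hard_vocab_limit=False,  # shrink the vocab automatically if the corpus is tiny
)

sp = spm.SentencePieceProcessor(model_file="train_unigram100.model")
pieces = sp.encode("mister quilter is the apostle of the middle classes", out_type=str)
print(" ".join(pieces))
```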
diff --git a/examples/other/tn/data/textnorm_test_cases.txt b/examples/other/tn/data/textnorm_test_cases.txt
index 17e90d0b..ba9e6529 100644
--- a/examples/other/tn/data/textnorm_test_cases.txt
+++ b/examples/other/tn/data/textnorm_test_cases.txt
@@ -32,7 +32,7 @@ iPad Pro的秒控键盘这次也推出白色版本。|iPad Pro的秒控键盘这
明天有62%的概率降雨|明天有百分之六十二的概率降雨
这是固话0421-33441122|这是固话零四二一三三四四一一二二
这是手机+86 18544139121|这是手机八六一八五四四一三九一二一
-小王的身高是153.5cm,梦想是打篮球!我觉得有0.1%的可能性。|小王的身高是一百五十三点五cm,梦想是打篮球!我觉得有百分之零点一的可能性。
+小王的身高是153.5cm,梦想是打篮球!我觉得有0.1%的可能性。|小王的身高是一百五十三点五厘米,梦想是打篮球!我觉得有百分之零点一的可能性。
不管三七二十一|不管三七二十一
九九八十一难|九九八十一难
2018年5月23号上午10点10分|二零一八年五月二十三号上午十点十分
@@ -124,4 +124,4 @@ iPad Pro的秒控键盘这次也推出白色版本。|iPad Pro的秒控键盘这
12~23|十二到二十三
12-23|十二到二十三
25cm²|二十五平方厘米
-25m|米
\ No newline at end of file
+25m|米
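Each line of `textnorm_test_cases.txt` pairs a raw sentence with its expected normalization, separated by `|`; the edits above expand `cm` to `厘米` and add the missing trailing newline. A small sketch for loading those pairs in a regression test (the normalizer call itself is left out, since its exact API is not part of this diff):

```python
def load_cases(path: str):
    """Yield (raw, expected) pairs from a `raw|expected` test-case file."""
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            raw, expected = line.split("|", 1)
            yield raw, expected


for raw, expected in load_cases("data/textnorm_test_cases.txt"):
    # plug in your text normalizer here and compare its output to `expected`
    print(raw, "->", expected)
```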
diff --git a/examples/other/tts_finetune/tts3/run.sh b/examples/other/tts_finetune/tts3/run.sh
index cc25d8f6..f5a65e6b 100755
--- a/examples/other/tts_finetune/tts3/run.sh
+++ b/examples/other/tts_finetune/tts3/run.sh
@@ -99,7 +99,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
--voc_ckpt=pretrained_models/hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=zh \
- --text=${BIN_DIR}/../sentences.txt \
+ --text=${BIN_DIR}/../../assets/sentences.txt \
--output_dir=./test_e2e/ \
--phones_dict=${dump_dir}/phone_id_map.txt \
--speaker_dict=${dump_dir}/speaker_id_map.txt \
diff --git a/examples/other/tts_finetune/tts3/run_en.sh b/examples/other/tts_finetune/tts3/run_en.sh
index 53721486..86c58afa 100755
--- a/examples/other/tts_finetune/tts3/run_en.sh
+++ b/examples/other/tts_finetune/tts3/run_en.sh
@@ -98,7 +98,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
--voc_ckpt=pretrained_models/hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=pretrained_models/hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
--lang=en \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=./test_e2e/ \
--phones_dict=${dump_dir}/phone_id_map.txt \
--speaker_dict=${dump_dir}/speaker_id_map.txt \
diff --git a/examples/other/tts_finetune/tts3/run_mix.sh b/examples/other/tts_finetune/tts3/run_mix.sh
index 7630022b..210f0314 100755
--- a/examples/other/tts_finetune/tts3/run_mix.sh
+++ b/examples/other/tts_finetune/tts3/run_mix.sh
@@ -100,7 +100,7 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
--voc_ckpt=pretrained_models/hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=pretrained_models/hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=mix \
- --text=${BIN_DIR}/../sentences_mix.txt \
+ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=./test_e2e/ \
--phones_dict=${dump_dir}/phone_id_map.txt \
--speaker_dict=${dump_dir}/speaker_id_map.txt \
diff --git a/examples/tiny/asr1/README.md b/examples/tiny/asr1/README.md
index cfa26670..489f5bc3 100644
--- a/examples/tiny/asr1/README.md
+++ b/examples/tiny/asr1/README.md
@@ -37,7 +37,7 @@ It will support the way of using `--variable value` in the shell scripts.
Some local variables are set in `run.sh`.
`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU.
`stage` denotes the number of stage you want the start from in the experiments.
-`stop stage` denotes the number of stage you want the stop at in the expriments.
+`stop stage` denotes the number of the stage you want to stop at in the experiments.
`conf_path` denotes the config path of the model.
`avg_num`denotes the number K of top-K models you want to average to get the final model.
`ckpt` denotes the checkpoint prefix of the model, e.g. "transformerr"
diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md
index 0bf2037f..3a6f3e1b 100644
--- a/examples/vctk/tts3/README.md
+++ b/examples/vctk/tts3/README.md
@@ -254,7 +254,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \
--voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \
--lang=en \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=exp/default/test_e2e \
--phones_dict=fastspeech2_vctk_ckpt_1.2.0/phone_id_map.txt \
--speaker_dict=fastspeech2_vctk_ckpt_1.2.0/speaker_id_map.txt \
diff --git a/examples/vctk/tts3/local/inference.sh b/examples/vctk/tts3/local/inference.sh
index 9c442614..ef23d951 100755
--- a/examples/vctk/tts3/local/inference.sh
+++ b/examples/vctk/tts3/local/inference.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_vctk \
--voc=pwgan_vctk \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -23,7 +23,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_vctk \
--voc=hifigan_vctk \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/vctk/tts3/local/lite_predict.sh b/examples/vctk/tts3/local/lite_predict.sh
index eb608535..53141b5f 100755
--- a/examples/vctk/tts3/local/lite_predict.sh
+++ b/examples/vctk/tts3/local/lite_predict.sh
@@ -11,7 +11,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_vctk \
--voc=pwgan_vctk \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/pdlite \
--am=fastspeech2_vctk \
--voc=hifigan_vctk \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/lite_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/vctk/tts3/local/ort_predict.sh b/examples/vctk/tts3/local/ort_predict.sh
index 4019e17f..f376ee75 100755
--- a/examples/vctk/tts3/local/ort_predict.sh
+++ b/examples/vctk/tts3/local/ort_predict.sh
@@ -10,7 +10,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_vctk \
--voc=pwgan_vctk \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
@@ -25,7 +25,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_vctk \
--voc=hifigan_vctk \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=2 \
diff --git a/examples/vctk/tts3/local/synthesize_e2e.sh b/examples/vctk/tts3/local/synthesize_e2e.sh
index a89f42b5..971c8385 100755
--- a/examples/vctk/tts3/local/synthesize_e2e.sh
+++ b/examples/vctk/tts3/local/synthesize_e2e.sh
@@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \
--voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \
--lang=en \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -43,7 +43,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
--lang=en \
- --text=${BIN_DIR}/../sentences_en.txt \
+ --text=${BIN_DIR}/../../assets/sentences_en.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/vctk/tts3/run.sh b/examples/vctk/tts3/run.sh
index a112b94b..76307bd5 100755
--- a/examples/vctk/tts3/run.sh
+++ b/examples/vctk/tts3/run.sh
@@ -43,10 +43,7 @@ fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
- version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
- if [[ -z "$version" || ${version} != '1.0.0' ]]; then
- pip install paddle2onnx==1.0.0
- fi
+ pip install paddle2onnx --upgrade
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_vctk
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_vctk
diff --git a/examples/vctk/vc3/conf/default.yaml b/examples/vctk/vc3/conf/default.yaml
index 0acc2a56..eb98515a 100644
--- a/examples/vctk/vc3/conf/default.yaml
+++ b/examples/vctk/vc3/conf/default.yaml
@@ -1,22 +1,135 @@
- generator_params:
+###########################################################
+# FEATURE EXTRACTION SETTING #
+###########################################################
+# The original code uses 24k when loading audio but 16k when extracting mel; later both loading and mel extraction should be changed to 24k
+fs: 16000
+n_fft: 2048
+n_shift: 300
+win_length: 1200 # Window length in samples (50 ms).
+ # If set to null, it will be the same as fft_size.
+window: "hann" # Window function.
+
+fmin: 0 # Minimum frequency of Mel basis.
+fmax: 8000 # Maximum frequency of Mel basis. sr // 2
+n_mels: 80
+# only for StarGANv2 VC
+norm: # None here
+htk: True
+power: 2.0
+
+
+###########################################################
+# MODEL SETTING #
+###########################################################
+generator_params:
dim_in: 64
style_dim: 64
max_conv_dim: 512
w_hpf: 0
F0_channel: 256
- mapping_network_params:
+mapping_network_params:
num_domains: 20 # num of speakers in StarGANv2
latent_dim: 16
style_dim: 64 # same as style_dim in generator_params
hidden_dim: 512 # same as max_conv_dim in generator_params
- style_encoder_params:
+style_encoder_params:
dim_in: 64 # same as dim_in in generator_params
style_dim: 64 # same as style_dim in generator_params
num_domains: 20 # same as num_domains in generator_params
max_conv_dim: 512 # same as max_conv_dim in generator_params
- discriminator_params:
+discriminator_params:
dim_in: 64 # same as dim_in in generator_params
num_domains: 20 # same as num_domains in mapping_network_params
max_conv_dim: 512 # same as max_conv_dim in generator_params
- n_repeat: 4
-
\ No newline at end of file
+ repeat_num: 4
+asr_params:
+ input_dim: 80
+ hidden_dim: 256
+ n_token: 80
+ token_embedding_dim: 256
+
+###########################################################
+# ADVERSARIAL LOSS SETTING #
+###########################################################
+loss_params:
+ g_loss:
+ lambda_sty: 1.
+ lambda_cyc: 5.
+ lambda_ds: 1.
+ lambda_norm: 1.
+ lambda_asr: 10.
+ lambda_f0: 5.
+ lambda_f0_sty: 0.1
+ lambda_adv: 2.
+ lambda_adv_cls: 0.5
+ norm_bias: 0.5
+ d_loss:
+ lambda_reg: 1.
+ lambda_adv_cls: 0.1
+ lambda_con_reg: 10.
+
+ adv_cls_epoch: 50
+ con_reg_epoch: 30
+
+
+###########################################################
+# DATA LOADER SETTING #
+###########################################################
+batch_size: 5 # Batch size.
+num_workers: 2 # Number of workers in DataLoader.
+max_mel_length: 192
+
+###########################################################
+# OPTIMIZER & SCHEDULER SETTING #
+###########################################################
+generator_optimizer_params:
+ beta1: 0.0
+ beta2: 0.99
+ weight_decay: 1.0e-4
+ epsilon: 1.0e-9
+generator_scheduler_params:
+ max_learning_rate: 2.0e-4
+ phase_pct: 0.0
+ divide_factor: 1
+ total_steps: 200000 # train_max_steps
+ end_learning_rate: 2.0e-4
+style_encoder_optimizer_params:
+ beta1: 0.0
+ beta2: 0.99
+ weight_decay: 1.0e-4
+ epsilon: 1.0e-9
+style_encoder_scheduler_params:
+ max_learning_rate: 2.0e-4
+ phase_pct: 0.0
+ divide_factor: 1
+ total_steps: 200000 # train_max_steps
+ end_learning_rate: 2.0e-4
+mapping_network_optimizer_params:
+ beta1: 0.0
+ beta2: 0.99
+ weight_decay: 1.0e-4
+ epsilon: 1.0e-9
+mapping_network_scheduler_params:
+ max_learning_rate: 2.0e-6
+ phase_pct: 0.0
+ divide_factor: 1
+ total_steps: 200000 # train_max_steps
+ end_learning_rate: 2.0e-6
+discriminator_optimizer_params:
+ beta1: 0.0
+ beta2: 0.99
+ weight_decay: 1.0e-4
+ epsilon: 1.0e-9
+discriminator_scheduler_params:
+ max_learning_rate: 2.0e-4
+ phase_pct: 0.0
+ divide_factor: 1
+ total_steps: 200000 # train_max_steps
+ end_learning_rate: 2.0e-4
+
+###########################################################
+# TRAINING SETTING #
+###########################################################
+max_epoch: 150
+num_snapshots: 5
+seed: 1
\ No newline at end of file
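The new feature-extraction block (fs 16000, n_fft 2048, n_shift 300, win_length 1200, a Hann window, and 80 HTK-style mel bands with `norm: None` and `power: 2.0`) describes a fairly standard mel-spectrogram front end. A hedged sketch of an equivalent extraction with librosa using the same parameter values; this is not the project's own preprocessing code, just an illustration of the settings:

```python
import librosa
import numpy as np

# Parameter values copied from the config above; sample.wav is a placeholder.
wav, sr = librosa.load("sample.wav", sr=16000)  # fs: 16000
mel = librosa.feature.melspectrogram(
    y=wav,
    sr=sr,
    n_fft=2048,        # n_fft
    hop_length=300,    # n_shift
    win_length=1200,   # 50 ms window
    window="hann",
    power=2.0,
    n_mels=80,
    fmin=0,
    fmax=8000,
    htk=True,
    norm=None,
)
log_mel = np.log(np.maximum(mel, 1e-10))  # a common log compression, for illustration
print(log_mel.shape)  # (80, n_frames)
```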
diff --git a/examples/vctk/vc3/local/preprocess.sh b/examples/vctk/vc3/local/preprocess.sh
index ea0fbc43..058171c5 100755
--- a/examples/vctk/vc3/local/preprocess.sh
+++ b/examples/vctk/vc3/local/preprocess.sh
@@ -6,13 +6,32 @@ stop_stage=100
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # extract features
+ echo "Extract features ..."
+ python3 ${BIN_DIR}/preprocess.py \
+ --dataset=vctk \
+ --rootdir=~/datasets/VCTK-Corpus-0.92/ \
+ --dumpdir=dump \
+ --config=${config_path} \
+ --num-cpu=20
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+echo "Normalize ..."
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/train/raw/metadata.jsonl \
+ --dumpdir=dump/train/norm \
+ --speaker-dict=dump/speaker_id_map.txt
+
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/dev/raw/metadata.jsonl \
+ --dumpdir=dump/dev/norm \
+ --speaker-dict=dump/speaker_id_map.txt
+
+ python3 ${BIN_DIR}/normalize.py \
+ --metadata=dump/test/raw/metadata.jsonl \
+ --dumpdir=dump/test/norm \
+ --speaker-dict=dump/speaker_id_map.txt
fi
diff --git a/examples/vctk/vc3/local/train.sh b/examples/vctk/vc3/local/train.sh
index 3a507650..d4ea02da 100755
--- a/examples/vctk/vc3/local/train.sh
+++ b/examples/vctk/vc3/local/train.sh
@@ -9,5 +9,4 @@ python3 ${BIN_DIR}/train.py \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=1 \
- --phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
diff --git a/examples/zh_en_tts/tts3/.gitignore b/examples/zh_en_tts/tts3/.gitignore
new file mode 100644
index 00000000..bbd86a25
--- /dev/null
+++ b/examples/zh_en_tts/tts3/.gitignore
@@ -0,0 +1,2 @@
+data
+exp
diff --git a/examples/zh_en_tts/tts3/README.md b/examples/zh_en_tts/tts3/README.md
index 01202800..15de3f48 100644
--- a/examples/zh_en_tts/tts3/README.md
+++ b/examples/zh_en_tts/tts3/README.md
@@ -6,11 +6,11 @@ This example contains code used to train a [Fastspeech2](https://arxiv.org/abs/2
## Dataset
### Download and Extract
-Download all datasets and extract it to `~/datasets`:
-- The CSMSC dataset is in the directory `~/datasets/BZNSYP`
-- The Ljspeech dataset is in the directory `~/datasets/LJSpeech-1.1`
-- The aishell3 dataset is in the directory `~/datasets/data_aishell3`
-- The vctk dataset is in the directory `~/datasets/VCTK-Corpus-0.92`
+Download all datasets and extract them to `./data`:
+- The CSMSC dataset is in the directory `./data/BZNSYP`
+- The Ljspeech dataset is in the directory `./data/LJSpeech-1.1`
+- The aishell3 dataset is in the directory `./data/data_aishell3`
+- The vctk dataset is in the directory `./data/VCTK-Corpus-0.92`
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for the fastspeech2 training.
@@ -24,16 +24,16 @@ Or train your MFA model reference to [mfa example](https://github.com/PaddlePadd
## Get Started
Assume the paths to the datasets are:
-- `~/datasets/BZNSYP`
-- `~/datasets/LJSpeech-1.1`
-- `~/datasets/data_aishell3`
-- `~/datasets/VCTK-Corpus-0.92`
+- `./data/BZNSYP`
+- `./data/LJSpeech-1.1`
+- `./data/data_aishell3`
+- `./data/VCTK-Corpus-0.92`
Assume the path to the MFA results of the datasets are:
-- `./mfa_results/baker_alignment_tone`
-- `./mfa_results/ljspeech_alignment`
-- `./mfa_results/aishell3_alignment_tone`
-- `./mfa_results/vctk_alignment`
+- `./data/mfa/baker_alignment_tone`
+- `./data/mfa/ljspeech_alignment`
+- `./data/mfa/aishell3_alignment_tone`
+- `./data/mfa/vctk_alignment`
Run the command below to
1. **source path**.
@@ -252,8 +252,10 @@ optional arguments:
## Pretrained Model
+
Pretrained FastSpeech2 model with no silence in the edge of audios:
- [fastspeech2_mix_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip)
+- [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)
The static model can be downloaded here:
- [fastspeech2_mix_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_static_0.2.0.zip)
@@ -285,18 +287,18 @@ FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/../synthesize_e2e.py \
--am=fastspeech2_mix \
- --am_config=fastspeech2_mix_ckpt_1.2.0/default.yaml \
- --am_ckpt=fastspeech2_mix_ckpt_1.2.0/snapshot_iter_99200.pdz \
- --am_stat=fastspeech2_mix_ckpt_1.2.0/speech_stats.npy \
+ --am_config=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/default.yaml \
+ --am_ckpt=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/snapshot_iter_99200.pdz \
+ --am_stat=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/speech_stats.npy \
+ --phones_dict=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt \
+ --speaker_dict=exp/pretrain/fastspeech2_mix_ckpt_1.2.0/speaker_id_map.txt \
+ --spk_id=174 \
--voc=pwgan_aishell3 \
- --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
- --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
- --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
+ --voc_config=exp/pretrain/pwg_aishell3_ckpt_0.5/default.yaml \
+ --voc_ckpt=exp/pretrain/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+ --voc_stat=exp/pretrain/pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=mix \
- --text=${BIN_DIR}/../sentences_mix.txt \
+ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=exp/default/test_e2e \
- --phones_dict=fastspeech2_mix_ckpt_1.2.0/phone_id_map.txt \
- --speaker_dict=fastspeech2_mix_ckpt_1.2.0/speaker_id_map.txt \
- --spk_id=174 \
--inference_dir=exp/default/inference
```
diff --git a/examples/zh_en_tts/tts3/local/inference.sh b/examples/zh_en_tts/tts3/local/inference.sh
index 16499ed0..e4168fd0 100755
--- a/examples/zh_en_tts/tts3/local/inference.sh
+++ b/examples/zh_en_tts/tts3/local/inference.sh
@@ -13,7 +13,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_mix \
--voc=pwgan_aishell3 \
- --text=${BIN_DIR}/../sentences_mix.txt \
+ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -30,7 +30,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_mix \
--voc=hifigan_aishell3 \
- --text=${BIN_DIR}/../sentences_mix.txt \
+ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -45,7 +45,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--inference_dir=${train_output_path}/inference \
--am=fastspeech2_mix \
--voc=hifigan_csmsc \
- --text=${BIN_DIR}/../sentences_mix.txt \
+ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/zh_en_tts/tts3/local/mfa_download.sh b/examples/zh_en_tts/tts3/local/mfa_download.sh
new file mode 100755
index 00000000..1863c896
--- /dev/null
+++ b/examples/zh_en_tts/tts3/local/mfa_download.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+exp=exp
+mfa=$exp/mfa
+
+mkdir -p $mfa
+
+pushd $mfa
+
+wget -c https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz &
+wget -c https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz &
+wget -c https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz &
+wget -c https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz &
+wait
+
+popd
diff --git a/examples/zh_en_tts/tts3/local/model_download.sh b/examples/zh_en_tts/tts3/local/model_download.sh
new file mode 100755
index 00000000..21a218a8
--- /dev/null
+++ b/examples/zh_en_tts/tts3/local/model_download.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+exp=exp
+pretrain=$exp/pretrain
+
+mkdir -p $pretrain
+
+pushd $pretrain
+
+wget -c https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip &
+wget -c https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip &
+wait
+
+popd
diff --git a/examples/zh_en_tts/tts3/local/ort_predict.sh b/examples/zh_en_tts/tts3/local/ort_predict.sh
index d80da9c9..0d5ac675 100755
--- a/examples/zh_en_tts/tts3/local/ort_predict.sh
+++ b/examples/zh_en_tts/tts3/local/ort_predict.sh
@@ -13,7 +13,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--am=fastspeech2_mix \
--voc=pwgan_aishell3 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../sentences_mix.txt \
+ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=4 \
@@ -31,7 +31,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--am=fastspeech2_mix \
--voc=hifigan_aishell3 \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../sentences_mix.txt \
+ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=4 \
@@ -45,7 +45,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--am=fastspeech2_mix \
--voc=hifigan_csmsc \
--output_dir=${train_output_path}/onnx_infer_out_e2e \
- --text=${BIN_DIR}/../sentences_mix.txt \
+ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--phones_dict=dump/phone_id_map.txt \
--device=cpu \
--cpu_threads=4 \
diff --git a/examples/zh_en_tts/tts3/local/synthesize_e2e.sh b/examples/zh_en_tts/tts3/local/synthesize_e2e.sh
index f6ee04ae..daad7180 100755
--- a/examples/zh_en_tts/tts3/local/synthesize_e2e.sh
+++ b/examples/zh_en_tts/tts3/local/synthesize_e2e.sh
@@ -23,7 +23,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
--voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
--voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
--lang=mix \
- --text=${BIN_DIR}/../sentences_mix.txt \
+ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -48,7 +48,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
--voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
--lang=mix \
- --text=${BIN_DIR}/../sentences_mix.txt \
+ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
@@ -73,7 +73,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
--voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
--voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
--lang=mix \
- --text=${BIN_DIR}/../sentences_mix.txt \
+ --text=${BIN_DIR}/../../assets/sentences_mix.txt \
--output_dir=${train_output_path}/test_e2e \
--phones_dict=dump/phone_id_map.txt \
--speaker_dict=dump/speaker_id_map.txt \
diff --git a/examples/zh_en_tts/tts3/run.sh b/examples/zh_en_tts/tts3/run.sh
index 12f99081..a18421f5 100755
--- a/examples/zh_en_tts/tts3/run.sh
+++ b/examples/zh_en_tts/tts3/run.sh
@@ -7,8 +7,8 @@ gpus=0,1
stage=0
stop_stage=100
-datasets_root_dir=~/datasets
-mfa_root_dir=./mfa_results/
+datasets_root_dir=./data
+mfa_root_dir=./data/mfa
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_99200.pdz
@@ -46,10 +46,7 @@ fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# install paddle2onnx
- version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
- if [[ -z "$version" || ${version} != '1.0.0' ]]; then
- pip install paddle2onnx==1.0.0
- fi
+ pip install paddle2onnx --upgrade
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_mix
# considering the balance between speed and quality, we recommend that you use hifigan as vocoder
./local/paddle2onnx.sh ${train_output_path} inference inference_onnx pwgan_aishell3
diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py
index 6c7e75c1..969d189f 100644
--- a/paddlespeech/__init__.py
+++ b/paddlespeech/__init__.py
@@ -13,3 +13,7 @@
# limitations under the License.
import _locale
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
+
+__version__ = '0.0.0'
+
+__commit__ = '9cf8c1985a98bb380c183116123672976bdfe5c9'
diff --git a/paddlespeech/audio/transform/perturb.py b/paddlespeech/audio/transform/perturb.py
index 0825caec..757a2f1b 100644
--- a/paddlespeech/audio/transform/perturb.py
+++ b/paddlespeech/audio/transform/perturb.py
@@ -14,6 +14,7 @@
# Modified from espnet(https://github.com/espnet/espnet)
import io
import os
+import sys
import h5py
import librosa
@@ -98,7 +99,7 @@ class SoundHDF5File():
def __contains__(self, item):
return item in self.file
- def __len__(self, item):
+ def __len__(self):
return len(self.file)
def __enter__(self):
diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index 7a7aef8b..231a00f4 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -25,9 +25,6 @@ import librosa
import numpy as np
import paddle
import soundfile
-from paddlespeech.audio.transform.transformation import Transformation
-from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.utils.utility import UpdateConfig
from yacs.config import CfgNode
from ...utils.env import MODEL_HOME
@@ -37,6 +34,9 @@ from ..log import logger
from ..utils import CLI_TIMER
from ..utils import stats_wrapper
from ..utils import timer_register
+from paddlespeech.audio.transform.transformation import Transformation
+from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
+from paddlespeech.s2t.utils.utility import UpdateConfig
__all__ = ['ASRExecutor']
@@ -274,7 +274,7 @@ class ASRExecutor(BaseExecutor):
# fbank
audio = preprocessing(audio, **preprocess_args)
- audio_len = paddle.to_tensor(audio.shape[0])
+ audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0)
audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)
self._inputs["audio"] = audio
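Several hunks in this patch append `.unsqueeze(axis=0)` when building `audio_len` (and pass `[spk_id]` instead of `spk_id`): on recent paddlepaddle releases `paddle.to_tensor(scalar)` yields a 0-d tensor, while the models expect a length of shape `[1]`. A tiny sketch of the difference, assuming paddlepaddle is installed:

```python
import paddle

length = 51200  # e.g. number of audio samples or frames

scalar_len = paddle.to_tensor(length)                     # 0-d tensor, shape []
batched_len = paddle.to_tensor(length).unsqueeze(axis=0)  # shape [1]

print(scalar_len.shape, batched_len.shape)  # [] [1]
```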
diff --git a/paddlespeech/cli/download.py b/paddlespeech/cli/download.py
index 5661f18f..e77a05d2 100644
--- a/paddlespeech/cli/download.py
+++ b/paddlespeech/cli/download.py
@@ -133,10 +133,10 @@ def _get_download(url, fullname):
total_size = req.headers.get('content-length')
with open(tmp_fullname, 'wb') as f:
if total_size:
- with tqdm(total=(int(total_size) + 1023) // 1024) as pbar:
+ with tqdm(total=(int(total_size)), unit='B', unit_scale=True) as pbar:
for chunk in req.iter_content(chunk_size=1024):
f.write(chunk)
- pbar.update(1)
+ pbar.update(len(chunk))
else:
for chunk in req.iter_content(chunk_size=1024):
if chunk:
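The progress bar in `_get_download` now tracks bytes rather than 1 KiB chunks: `unit='B'` plus `unit_scale=True` lets tqdm print human-readable sizes, and each update advances by the actual chunk length. A stand-alone sketch of the same pattern with `requests` and `tqdm` (URL and output name are placeholders):

```python
import requests
from tqdm import tqdm

url = "https://example.com/file.tar.gz"  # placeholder URL
req = requests.get(url, stream=True)
total_size = req.headers.get("content-length")

with open("file.tar.gz", "wb") as f:
    if total_size:
        with tqdm(total=int(total_size), unit="B", unit_scale=True) as pbar:
            for chunk in req.iter_content(chunk_size=1024):
                f.write(chunk)
                pbar.update(len(chunk))  # advance by bytes written, not chunk count
    else:
        for chunk in req.iter_content(chunk_size=1024):
            if chunk:
                f.write(chunk)
```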
diff --git a/paddlespeech/cli/ssl/infer.py b/paddlespeech/cli/ssl/infer.py
index dce7c778..9b4b0280 100644
--- a/paddlespeech/cli/ssl/infer.py
+++ b/paddlespeech/cli/ssl/infer.py
@@ -51,11 +51,8 @@ class SSLExecutor(BaseExecutor):
self.parser.add_argument(
'--model',
type=str,
- default=None,
- choices=[
- tag[:tag.index('-')]
- for tag in self.task_resource.pretrained_models.keys()
- ],
+ default='wav2vec2',
+ choices=['wav2vec2', 'hubert', "wavlm"],
help='Choose model type of asr task.')
self.parser.add_argument(
'--task',
@@ -67,7 +64,7 @@ class SSLExecutor(BaseExecutor):
'--lang',
type=str,
default='en',
- help='Choose model language. zh or en, zh:[wav2vec2ASR_aishell1-zh-16k], en:[wav2vec2ASR_librispeech-en-16k]'
+ help='Choose model language. zh or en, zh:[wav2vec2ASR_aishell1-zh-16k], en:[wav2vec2ASR_librispeech-en-16k, hubertASR_librispeech_100-en-16k]'
)
self.parser.add_argument(
"--sample_rate",
@@ -137,13 +134,6 @@ class SSLExecutor(BaseExecutor):
logger.debug("start to init the model")
if model_type is None:
- if lang == 'en':
- model_type = 'wav2vec2ASR_librispeech'
- elif lang == 'zh':
- model_type = 'wav2vec2ASR_aishell1'
- else:
- logger.error(
- "invalid lang, please input --lang en or --lang zh")
logger.debug(
"Model type had not been specified, default {} was used.".
format(model_type))
@@ -155,9 +145,26 @@ class SSLExecutor(BaseExecutor):
if cfg_path is None or ckpt_path is None:
sample_rate_str = '16k' if sample_rate == 16000 else '8k'
if task == 'asr':
- tag = model_type + '-' + lang + '-' + sample_rate_str
+ if model_type == 'wav2vec2':
+ if lang == 'en':
+ model_prefix = 'wav2vec2ASR_librispeech'
+ elif lang == 'zh':
+ model_prefix = 'wav2vec2ASR_aishell1'
+ tag = model_prefix + '-' + lang + '-' + sample_rate_str
+ elif model_type == 'hubert':
+ if lang == 'en':
+ model_prefix = 'hubertASR_librispeech-100h'
+ elif lang == 'zh':
+ logger.error("zh hubertASR is not supported yet")
+ tag = model_prefix + '-' + lang + '-' + sample_rate_str
+ elif model_type == 'wavlm':
+ if lang == "en":
+ model_prefix = "wavlmASR_librispeech"
+ elif lang == "zh":
+ logger.error("zh wavlmASR is not supported yet")
+ tag = model_prefix + '-' + lang + '-' + sample_rate_str
else:
- tag = 'wav2vec2' + '-' + lang + '-' + sample_rate_str
+ tag = model_type + '-' + lang + '-' + sample_rate_str
self.task_resource.set_task_model(tag, version=None)
self.res_path = self.task_resource.res_dir
@@ -184,16 +191,17 @@ class SSLExecutor(BaseExecutor):
self.text_feature = TextFeaturizer(
unit_type=self.config.unit_type,
vocab=self.config.vocab_filepath)
+ self.config.output_dim = len(self.config.vocab_filepath)
elif lang == 'zh':
self.text_feature = AutoTokenizer.from_pretrained(
self.config.tokenizer)
+ self.config.output_dim = self.text_feature.vocab_size
self.config.decode.decoding_method = decode_method
- model_name = model_type[:model_type.rindex(
+ model_name = model_prefix[:model_prefix.rindex(
'_')] # model_type: {model_name}_{dataset}
else:
- model_name = 'wav2vec2'
+ model_name = model_type
model_class = self.task_resource.get_model_class(model_name)
-
model_conf = self.config
model = model_class.from_config(model_conf)
self.model = model
@@ -204,9 +212,9 @@ class SSLExecutor(BaseExecutor):
if task == 'asr':
self.model.set_state_dict(model_dict)
else:
- self.model.wav2vec2.set_state_dict(model_dict)
+ getattr(self.model, model_type).set_state_dict(model_dict)
- def preprocess(self, model_type: str, input: Union[str, os.PathLike]):
+ def preprocess(self, input: Union[str, os.PathLike]):
"""
Input preprocess and return paddle.Tensor stored in self.input.
Input content can be a text(tts), a file(asr, cls) or a streaming(not supported yet).
@@ -245,7 +253,7 @@ class SSLExecutor(BaseExecutor):
# fbank
audio = preprocessing(audio, **preprocess_args)
- audio_len = paddle.to_tensor(audio.shape[0])
+ audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0)
audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)
self._inputs["audio"] = audio
@@ -263,8 +271,7 @@ class SSLExecutor(BaseExecutor):
audio = self._inputs["audio"]
if task == 'asr':
cfg = self.config.decode
- logger.debug(
- f"we will use the wav2vec2ASR like model : {model_type}")
+ logger.debug(f"we will use the {model_type}ASR like model.")
try:
result_transcripts = self.model.decode(
audio,
@@ -277,7 +284,8 @@ class SSLExecutor(BaseExecutor):
logger.exception(e)
else:
logger.debug(
- "we will use the wav2vec2 like model to extract audio feature")
+ f"we will use the {model_type} like model to extract audio feature."
+ )
try:
out_feature = self.model(audio[:, :, 0])
self._outputs["result"] = out_feature[0]
@@ -454,7 +462,7 @@ class SSLExecutor(BaseExecutor):
if rtf:
k = self.__class__.__name__
CLI_TIMER[k]['start'].append(time.time())
- self.preprocess(model, audio_file)
+ self.preprocess(audio_file)
self.infer(model, task)
res = self.postprocess() # Retrieve result of asr.
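After this change, the SSL executor derives the pretrained-model tag from an explicit `--model` choice (`wav2vec2`, `hubert`, `wavlm`) plus language and sample rate, e.g. `wav2vec2ASR_librispeech-en-16k`. A condensed sketch of that tag selection as a pure function, mirroring the branches above rather than the actual class method:

```python
# Maps (model_type, lang) to the model prefix used in the resource tag,
# following the branches added in SSLExecutor above.
_ASR_PREFIXES = {
    ("wav2vec2", "en"): "wav2vec2ASR_librispeech",
    ("wav2vec2", "zh"): "wav2vec2ASR_aishell1",
    ("hubert", "en"): "hubertASR_librispeech-100h",
    ("wavlm", "en"): "wavlmASR_librispeech",
}


def asr_model_tag(model_type: str, lang: str, sample_rate: int) -> str:
    sample_rate_str = "16k" if sample_rate == 16000 else "8k"
    try:
        prefix = _ASR_PREFIXES[(model_type, lang)]
    except KeyError:
        raise ValueError(f"{lang} {model_type}ASR is not supported yet")
    return f"{prefix}-{lang}-{sample_rate_str}"


print(asr_model_tag("hubert", "en", 16000))  # hubertASR_librispeech-100h-en-16k
```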
diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py
index bc2bdd1a..0867e815 100644
--- a/paddlespeech/cli/st/infer.py
+++ b/paddlespeech/cli/st/infer.py
@@ -252,7 +252,7 @@ class STExecutor(BaseExecutor):
norm_feat = dict(kaldiio.load_ark(process.stdout))[utt_name]
self._inputs["audio"] = paddle.to_tensor(norm_feat).unsqueeze(0)
self._inputs["audio_len"] = paddle.to_tensor(
- self._inputs["audio"].shape[1], dtype="int64")
+ self._inputs["audio"].shape[1:2], dtype="int64")
else:
raise ValueError("Wrong model type.")
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index 4787e1ee..beba7f60 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -491,7 +491,7 @@ class TTSExecutor(BaseExecutor):
# multi speaker
if am_dataset in {'aishell3', 'vctk', 'mix', 'canton'}:
mel = self.am_inference(
- part_phone_ids, spk_id=paddle.to_tensor(spk_id))
+ part_phone_ids, spk_id=paddle.to_tensor([spk_id]))
else:
mel = self.am_inference(part_phone_ids)
self.am_time += (time.time() - am_st)
diff --git a/paddlespeech/cli/whisper/infer.py b/paddlespeech/cli/whisper/infer.py
index ebcca890..17e8c0b8 100644
--- a/paddlespeech/cli/whisper/infer.py
+++ b/paddlespeech/cli/whisper/infer.py
@@ -253,7 +253,7 @@ class WhisperExecutor(BaseExecutor):
# fbank
audio = log_mel_spectrogram(audio, resource_path=self.resource_path)
- audio_len = paddle.to_tensor(audio.shape[0])
+ audio_len = paddle.to_tensor(audio.shape[0]).unsqueeze(axis=0)
self._inputs["audio"] = audio
self._inputs["audio_len"] = audio_len
diff --git a/paddlespeech/dataset/__init__.py b/paddlespeech/dataset/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/dataset/aidatatang_200zh/README.md b/paddlespeech/dataset/aidatatang_200zh/README.md
similarity index 100%
rename from dataset/aidatatang_200zh/README.md
rename to paddlespeech/dataset/aidatatang_200zh/README.md
diff --git a/paddlespeech/dataset/aidatatang_200zh/__init__.py b/paddlespeech/dataset/aidatatang_200zh/__init__.py
new file mode 100644
index 00000000..9146247d
--- /dev/null
+++ b/paddlespeech/dataset/aidatatang_200zh/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .aidatatang_200zh import main as aidatatang_200zh_main
diff --git a/paddlespeech/dataset/aidatatang_200zh/aidatatang_200zh.py b/paddlespeech/dataset/aidatatang_200zh/aidatatang_200zh.py
new file mode 100644
index 00000000..5d914a43
--- /dev/null
+++ b/paddlespeech/dataset/aidatatang_200zh/aidatatang_200zh.py
@@ -0,0 +1,158 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare aidatatang_200zh mandarin dataset
+
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+import argparse
+import codecs
+import json
+import os
+from pathlib import Path
+
+import soundfile
+
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
+from paddlespeech.utils.argparse import print_arguments
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+
+URL_ROOT = 'http://www.openslr.org/resources/62'
+# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
+DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
+MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+ "--target_dir",
+ default=DATA_HOME + "/aidatatang_200zh",
+ type=str,
+ help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+ "--manifest_prefix",
+ default="manifest",
+ type=str,
+ help="Filepath prefix for output manifests. (default: %(default)s)")
+args = parser.parse_args()
+
+
+def create_manifest(data_dir, manifest_path_prefix):
+ print("Creating manifest %s ..." % manifest_path_prefix)
+ json_lines = []
+ transcript_path = os.path.join(data_dir, 'transcript',
+ 'aidatatang_200_zh_transcript.txt')
+ transcript_dict = {}
+ for line in codecs.open(transcript_path, 'r', 'utf-8'):
+ line = line.strip()
+ if line == '':
+ continue
+ audio_id, text = line.split(' ', 1)
+ # remove whitespace from the character text
+ text = ''.join(text.split())
+ transcript_dict[audio_id] = text
+
+ data_types = ['train', 'dev', 'test']
+ for dtype in data_types:
+ del json_lines[:]
+ total_sec = 0.0
+ total_text = 0.0
+ total_num = 0
+
+ audio_dir = os.path.join(data_dir, 'corpus/', dtype)
+ for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+ for fname in filelist:
+ if not fname.endswith('.wav'):
+ continue
+
+ audio_path = os.path.abspath(os.path.join(subfolder, fname))
+ audio_id = os.path.basename(fname)[:-4]
+ utt2spk = Path(audio_path).parent.name
+
+ audio_data, samplerate = soundfile.read(audio_path)
+ duration = float(len(audio_data) / samplerate)
+ text = transcript_dict[audio_id]
+ json_lines.append(
+ json.dumps(
+ {
+ 'utt': audio_id,
+ 'utt2spk': str(utt2spk),
+ 'feat': audio_path,
+ 'feat_shape': (duration, ), # second
+ 'text': text,
+ },
+ ensure_ascii=False))
+
+ total_sec += duration
+ total_text += len(text)
+ total_num += 1
+
+ manifest_path = manifest_path_prefix + '.' + dtype
+ with codecs.open(manifest_path, 'w', 'utf-8') as fout:
+ for line in json_lines:
+ fout.write(line + '\n')
+
+ manifest_dir = os.path.dirname(manifest_path_prefix)
+ meta_path = os.path.join(manifest_dir, dtype) + '.meta'
+ with open(meta_path, 'w') as f:
+ print(f"{dtype}:", file=f)
+ print(f"{total_num} utts", file=f)
+ print(f"{total_sec / (60*60)} h", file=f)
+ print(f"{total_text} text", file=f)
+ print(f"{total_text / total_sec} text/sec", file=f)
+ print(f"{total_sec / total_num} sec/utt", file=f)
+
+
+def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
+ """Download, unpack and create manifest file."""
+ data_dir = os.path.join(target_dir, subset)
+ if not os.path.exists(data_dir):
+ filepath = download(url, md5sum, target_dir)
+ unpack(filepath, target_dir)
+ # unpack all audio tar files
+ audio_dir = os.path.join(data_dir, 'corpus')
+ for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)):
+ for sub in dirlist:
+ print(f"unpack dir {sub}...")
+ for folder, _, filelist in sorted(
+ os.walk(os.path.join(subfolder, sub))):
+ for ftar in filelist:
+ unpack(os.path.join(folder, ftar), folder, True)
+ else:
+ print("Skip downloading and unpacking. Data already exists in %s." %
+ target_dir)
+
+ create_manifest(data_dir, manifest_path)
+
+
+def main():
+ print_arguments(args, globals())
+ if args.target_dir.startswith('~'):
+ args.target_dir = os.path.expanduser(args.target_dir)
+
+ prepare_dataset(
+ url=DATA_URL,
+ md5sum=MD5_DATA,
+ target_dir=args.target_dir,
+ manifest_path=args.manifest_prefix,
+ subset='aidatatang_200zh')
+
+ print("Data download and manifest prepare done!")
+
+
+if __name__ == '__main__':
+ main()
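`create_manifest` writes one JSON object per utterance with `utt`, `utt2spk`, `feat`, `feat_shape` (duration in seconds), and `text` fields. A small sketch of producing one such line for a single wav file; the path and transcript are placeholders, while the real script walks the whole corpus:

```python
import json
import os
from pathlib import Path

import soundfile


def manifest_line(audio_path: str, text: str) -> str:
    """Build one manifest entry in the same format as create_manifest above."""
    audio_path = os.path.abspath(audio_path)
    audio_data, samplerate = soundfile.read(audio_path)
    duration = float(len(audio_data) / samplerate)
    return json.dumps(
        {
            "utt": os.path.basename(audio_path)[:-4],  # strip ".wav"
            "utt2spk": Path(audio_path).parent.name,   # speaker folder name
            "feat": audio_path,
            "feat_shape": (duration, ),                # seconds
            "text": text,
        },
        ensure_ascii=False)


# placeholder path and transcript, for illustration only
print(manifest_line("corpus/train/G0002/T0055G0002S0001.wav", "今天天气不错"))
```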
diff --git a/paddlespeech/dataset/aishell/README.md b/paddlespeech/dataset/aishell/README.md
new file mode 100644
index 00000000..c46312df
--- /dev/null
+++ b/paddlespeech/dataset/aishell/README.md
@@ -0,0 +1,58 @@
+# [Aishell1](http://openslr.elda.org/33/)
+
+This open-source Mandarin speech corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is part of AISHELL-ASR0009, whose utterances cover 11 domains, including smart home, autonomous driving, and industrial production. All recordings were made in a quiet indoor environment using 3 devices simultaneously: a high-fidelity microphone (44.1kHz, 16-bit), an Android mobile phone (16kHz, 16-bit), and an iOS mobile phone (16kHz, 16-bit). The high-fidelity audio was re-sampled to 16kHz to build AISHELL-ASR0009-OS1. 400 speakers from different accent areas in China were invited to take part in the recording. The manual transcription accuracy is above 95%, ensured by professional speech annotation and strict quality inspection. The corpus is divided into training, development, and test sets. (This database is free for academic research, not for commercial use without permission.)
+
+
+## Dataset Architecture
+
+```bash
+data_aishell
+├── transcript # transcript directory
+└── wav # wav directory
+ ├── dev # dev directory
+ │ ├── S0724 # speaker directory
+ │ ├── S0725
+ │ ├── S0726
+ ├── train
+ │ ├── S0724
+ │ ├── S0725
+ │ ├── S0726
+ ├── test
+ │ ├── S0724
+ │ ├── S0725
+ │ ├── S0726
+
+
+data_aishell
+├── transcript
+│ └── aishell_transcript_v0.8.txt # transcript annotation file
+└── wav
+ ├── dev
+ │ ├── S0724
+ │ │ ├── BAC009S0724W0121.wav # audio of speaker S0724
+ │ │ ├── BAC009S0724W0122.wav
+ │ │ ├── BAC009S0724W0123.wav
+ ├── test
+ │ ├── S0724
+ │ │ ├── BAC009S0724W0121.wav
+ │ │ ├── BAC009S0724W0122.wav
+ │ │ ├── BAC009S0724W0123.wav
+ ├── train
+ │ ├── S0724
+ │ │ ├── BAC009S0724W0121.wav
+ │ │ ├── BAC009S0724W0122.wav
+ │ │ ├── BAC009S0724W0123.wav
+
+Transcript annotation file format:
+> head data_aishell/transcript/aishell_transcript_v0.8.txt
+BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
+BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
+BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
+BAC009S0002W0125 各地 政府 便 纷纷 跟进
+BAC009S0002W0126 仅 一 个 多 月 的 时间 里
+BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
+BAC009S0002W0128 四十六 个 限 购 城市 当中
+BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
+BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
+BAC009S0002W0131 显示 出 了 极 强 的 威力
+```
diff --git a/paddlespeech/dataset/aishell/__init__.py b/paddlespeech/dataset/aishell/__init__.py
new file mode 100644
index 00000000..667680af
--- /dev/null
+++ b/paddlespeech/dataset/aishell/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .aishell import check_dataset
+from .aishell import create_manifest
+from .aishell import download_dataset
+from .aishell import main as aishell_main
+from .aishell import prepare_dataset
diff --git a/paddlespeech/dataset/aishell/aishell.py b/paddlespeech/dataset/aishell/aishell.py
new file mode 100644
index 00000000..7ea4d676
--- /dev/null
+++ b/paddlespeech/dataset/aishell/aishell.py
@@ -0,0 +1,230 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare Aishell mandarin dataset
+
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+import argparse
+import codecs
+import json
+import os
+from pathlib import Path
+
+import soundfile
+
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
+from paddlespeech.utils.argparse import print_arguments
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+
+URL_ROOT = 'http://openslr.elda.org/resources/33'
+# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
+DATA_URL = URL_ROOT + '/data_aishell.tgz'
+MD5_DATA = '2f494334227864a8a8fec932999db9d8'
+RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
+MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+ "--target_dir",
+ default=DATA_HOME + "/Aishell",
+ type=str,
+ help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+ "--manifest_prefix",
+ default="manifest",
+ type=str,
+ help="Filepath prefix for output manifests. (default: %(default)s)")
+args = parser.parse_args()
+
+
+def create_manifest(data_dir, manifest_path_prefix):
+ print("Creating manifest %s ..." % os.path.join(data_dir,
+ manifest_path_prefix))
+ json_lines = []
+ transcript_path = os.path.join(data_dir, 'transcript',
+ 'aishell_transcript_v0.8.txt')
+ transcript_dict = {}
+ for line in codecs.open(transcript_path, 'r', 'utf-8'):
+ line = line.strip()
+ if line == '':
+ continue
+ audio_id, text = line.split(' ', 1)
+        # remove whitespace; keep character-level text
+ text = ''.join(text.split())
+ transcript_dict[audio_id] = text
+
+ data_metas = dict()
+ data_types = ['train', 'dev', 'test']
+ for dtype in data_types:
+ del json_lines[:]
+ total_sec = 0.0
+ total_text = 0.0
+ total_num = 0
+
+ audio_dir = os.path.join(data_dir, 'wav', dtype)
+ for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+ for fname in filelist:
+ audio_path = os.path.abspath(os.path.join(subfolder, fname))
+ audio_id = os.path.basename(fname)[:-4]
+                # skip audios that have no transcription
+ if audio_id not in transcript_dict:
+ continue
+
+ utt2spk = Path(audio_path).parent.name
+ audio_data, samplerate = soundfile.read(audio_path)
+ duration = float(len(audio_data) / samplerate)
+ text = transcript_dict[audio_id]
+ json_lines.append(
+ json.dumps(
+ {
+ 'utt': audio_id,
+ 'utt2spk': str(utt2spk),
+ 'feat': audio_path,
+ 'feat_shape': (duration, ), # second
+ 'text': text
+ },
+ ensure_ascii=False))
+
+ total_sec += duration
+ total_text += len(text)
+ total_num += 1
+
+ manifest_path = manifest_path_prefix + '.' + dtype
+ with codecs.open(manifest_path, 'w', 'utf-8') as fout:
+ for line in json_lines:
+ fout.write(line + '\n')
+
+ meta = dict()
+ meta["dtype"] = dtype # train, dev, test
+ meta["utts"] = total_num
+ meta["hours"] = total_sec / (60 * 60)
+ meta["text"] = total_text
+ meta["text/sec"] = total_text / total_sec
+ meta["sec/utt"] = total_sec / total_num
+ data_metas[dtype] = meta
+
+ manifest_dir = os.path.dirname(manifest_path_prefix)
+ meta_path = os.path.join(manifest_dir, dtype) + '.meta'
+ with open(meta_path, 'w') as f:
+ for key, val in meta.items():
+ print(f"{key}: {val}", file=f)
+
+ return data_metas
+
+
+def download_dataset(url, md5sum, target_dir):
+ """Download, unpack and create manifest file."""
+ data_dir = os.path.join(target_dir, 'data_aishell')
+ if not os.path.exists(data_dir):
+ filepath = download(url, md5sum, target_dir)
+ unpack(filepath, target_dir)
+ # unpack all audio tar files
+ audio_dir = os.path.join(data_dir, 'wav')
+ for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+ for ftar in filelist:
+ unpack(os.path.join(subfolder, ftar), subfolder, True)
+ else:
+ print("Skip downloading and unpacking. Data already exists in %s." %
+ os.path.abspath(target_dir))
+ return os.path.abspath(data_dir)
+
+
+def check_dataset(data_dir):
+ print(f"check dataset {os.path.abspath(data_dir)} ...")
+
+ transcript_path = os.path.join(data_dir, 'transcript',
+ 'aishell_transcript_v0.8.txt')
+ if not os.path.exists(transcript_path):
+ raise FileNotFoundError(f"no transcript file found in {data_dir}.")
+
+ transcript_dict = {}
+ for line in codecs.open(transcript_path, 'r', 'utf-8'):
+ line = line.strip()
+ if line == '':
+ continue
+ audio_id, text = line.split(' ', 1)
+        # remove whitespace; keep character-level text
+ text = ''.join(text.split())
+ transcript_dict[audio_id] = text
+
+    data_types = ['train', 'dev', 'test']
+    for dtype in data_types:
+        no_label = 0
+ audio_dir = os.path.join(data_dir, 'wav', dtype)
+ if not os.path.exists(audio_dir):
+ raise IOError(f"{audio_dir} does not exist.")
+
+ for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+ for fname in filelist:
+ audio_path = os.path.abspath(os.path.join(subfolder, fname))
+ audio_id = os.path.basename(fname)[:-4]
+                # skip audios that have no transcription
+                if audio_id not in transcript_dict:
+                    print(f"Warning: {audio_id} has no transcript.")
+ no_label += 1
+ continue
+
+ utt2spk = Path(audio_path).parent.name
+ audio_data, samplerate = soundfile.read(audio_path)
+ assert samplerate == 16000, f"{audio_path} sample rate is {samplerate} not 16k, please check."
+
+        print(f"Warning: {dtype} split has {no_label} audios without transcripts.")
+
+
+def prepare_dataset(url, md5sum, target_dir, manifest_path=None, check=False):
+ """Download, unpack and create manifest file."""
+ data_dir = download_dataset(url, md5sum, target_dir)
+
+ if check:
+ try:
+ check_dataset(data_dir)
+        except Exception as e:
+            raise ValueError(
+                f"{data_dir} dataset format is not correct, please check it.") from e
+
+ meta = None
+ if manifest_path:
+ meta = create_manifest(data_dir, manifest_path)
+
+ return data_dir, meta
+
+
+def main():
+ print_arguments(args, globals())
+ if args.target_dir.startswith('~'):
+ args.target_dir = os.path.expanduser(args.target_dir)
+
+ data_dir, meta = prepare_dataset(
+ url=DATA_URL,
+ md5sum=MD5_DATA,
+ target_dir=args.target_dir,
+ manifest_path=args.manifest_prefix,
+ check=True)
+
+ resource_dir, _ = prepare_dataset(
+ url=RESOURCE_URL,
+ md5sum=MD5_RESOURCE,
+ target_dir=args.target_dir,
+ manifest_path=None)
+
+ print("Data download and manifest prepare done!")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/utils/utility.py b/paddlespeech/dataset/download.py
similarity index 59%
rename from utils/utility.py
rename to paddlespeech/dataset/download.py
index dbf8b1d7..28dbd0eb 100755
--- a/utils/utility.py
+++ b/paddlespeech/dataset/download.py
@@ -19,91 +19,16 @@ import zipfile
from typing import Text
__all__ = [
- "check_md5sum", "getfile_insensitive", "download_multi", "download",
- "unpack", "unzip", "md5file", "print_arguments", "add_arguments",
- "get_commandline_args"
+ "check_md5sum",
+ "getfile_insensitive",
+ "download_multi",
+ "download",
+ "unpack",
+ "unzip",
+ "md5file",
]
-def get_commandline_args():
- extra_chars = [
- " ",
- ";",
- "&",
- "(",
- ")",
- "|",
- "^",
- "<",
- ">",
- "?",
- "*",
- "[",
- "]",
- "$",
- "`",
- '"',
- "\\",
- "!",
- "{",
- "}",
- ]
-
- # Escape the extra characters for shell
- argv = [
- arg.replace("'", "'\\''") if all(char not in arg
- for char in extra_chars) else
- "'" + arg.replace("'", "'\\''") + "'" for arg in sys.argv
- ]
-
- return sys.executable + " " + " ".join(argv)
-
-
-def print_arguments(args, info=None):
- """Print argparse's arguments.
-
- Usage:
-
- .. code-block:: python
-
- parser = argparse.ArgumentParser()
- parser.add_argument("name", default="Jonh", type=str, help="User name.")
- args = parser.parse_args()
- print_arguments(args)
-
- :param args: Input argparse.Namespace for printing.
- :type args: argparse.Namespace
- """
- filename = ""
- if info:
- filename = info["__file__"]
- filename = os.path.basename(filename)
- print(f"----------- {filename} Configuration Arguments -----------")
- for arg, value in sorted(vars(args).items()):
- print("%s: %s" % (arg, value))
- print("-----------------------------------------------------------")
-
-
-def add_arguments(argname, type, default, help, argparser, **kwargs):
- """Add argparse's argument.
-
- Usage:
-
- .. code-block:: python
-
- parser = argparse.ArgumentParser()
- add_argument("name", str, "Jonh", "User name.", parser)
- args = parser.parse_args()
- """
- type = distutils.util.strtobool if type == bool else type
- argparser.add_argument(
- "--" + argname,
- default=default,
- type=type,
- help=help + ' Default: %(default)s.',
- **kwargs)
-
-
def md5file(fname):
hash_md5 = hashlib.md5()
f = open(fname, "rb")
diff --git a/paddlespeech/dataset/s2t/__init__.py b/paddlespeech/dataset/s2t/__init__.py
new file mode 100644
index 00000000..27ea9e77
--- /dev/null
+++ b/paddlespeech/dataset/s2t/__init__.py
@@ -0,0 +1,20 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# s2t utils binaries.
+from .avg_model import main as avg_ckpts_main
+from .build_vocab import main as build_vocab_main
+from .compute_mean_std import main as compute_mean_std_main
+from .compute_wer import main as compute_wer_main
+from .format_data import main as format_data_main
+from .format_rsl import main as format_rsl_main
diff --git a/paddlespeech/dataset/s2t/avg_model.py b/paddlespeech/dataset/s2t/avg_model.py
new file mode 100755
index 00000000..5bd5cb1f
--- /dev/null
+++ b/paddlespeech/dataset/s2t/avg_model.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import glob
+import json
+import os
+
+import numpy as np
+import paddle
+
+
+def average_checkpoints(dst_model="",
+ ckpt_dir="",
+ val_best=True,
+ num=5,
+ min_epoch=0,
+ max_epoch=65536):
+ paddle.set_device('cpu')
+
+ val_scores = []
+ jsons = glob.glob(f'{ckpt_dir}/[!train]*.json')
+ jsons = sorted(jsons, key=os.path.getmtime, reverse=True)
+ for y in jsons:
+ with open(y, 'r') as f:
+ dic_json = json.load(f)
+ loss = dic_json['val_loss']
+ epoch = dic_json['epoch']
+ if epoch >= min_epoch and epoch <= max_epoch:
+ val_scores.append((epoch, loss))
+    assert val_scores, f"No valid checkpoint metadata found in {ckpt_dir}"
+ val_scores = np.array(val_scores)
+
+ if val_best:
+ sort_idx = np.argsort(val_scores[:, 1])
+ sorted_val_scores = val_scores[sort_idx]
+ else:
+ sorted_val_scores = val_scores
+
+    best_val_scores = sorted_val_scores[:num, 1]
+    selected_epochs = sorted_val_scores[:num, 0].astype(np.int64)
+    avg_val_score = np.mean(best_val_scores)
+    print("selected val scores = " + str(best_val_scores))
+ print("selected epochs = " + str(selected_epochs))
+ print("averaged val score = " + str(avg_val_score))
+
+ path_list = [
+ ckpt_dir + '/{}.pdparams'.format(int(epoch))
+ for epoch in sorted_val_scores[:num, 0]
+ ]
+ print(path_list)
+
+    avg = None
+    assert num == len(path_list)
+ for path in path_list:
+ print(f'Processing {path}')
+ states = paddle.load(path)
+ if avg is None:
+ avg = states
+ else:
+ for k in avg.keys():
+ avg[k] += states[k]
+ # average
+ for k in avg.keys():
+ if avg[k] is not None:
+ avg[k] /= num
+
+ paddle.save(avg, dst_model)
+ print(f'Saving to {dst_model}')
+
+ meta_path = os.path.splitext(dst_model)[0] + '.avg.json'
+ with open(meta_path, 'w') as f:
+ data = json.dumps({
+ "mode": 'val_best' if val_best else 'latest',
+ "avg_ckpt": dst_model,
+ "val_loss_mean": avg_val_score,
+ "ckpts": path_list,
+ "epochs": selected_epochs.tolist(),
+            "val_losses": best_val_scores.tolist(),
+ })
+ f.write(data + "\n")
+
+
+def define_argparse():
+ parser = argparse.ArgumentParser(description='average model')
+    parser.add_argument(
+        '--dst_model', required=True, help='path to save the averaged model')
+    parser.add_argument(
+        '--ckpt_dir', required=True, help='ckpt model dir for average')
+    parser.add_argument(
+        '--val_best', action="store_true",
+        help='select checkpoints with the best validation loss')
+    parser.add_argument(
+        '--num', default=5, type=int, help='number of checkpoints to average')
+ parser.add_argument(
+ '--min_epoch',
+ default=0,
+ type=int,
+ help='min epoch used for averaging model')
+ parser.add_argument(
+ '--max_epoch',
+ default=65536, # Big enough
+ type=int,
+ help='max epoch used for averaging model')
+
+ args = parser.parse_args()
+ print(args)
+ return args
+
+
+def main():
+ args = define_argparse()
+ average_checkpoints(
+ dst_model=args.dst_model,
+ ckpt_dir=args.ckpt_dir,
+ val_best=args.val_best,
+ num=args.num,
+ min_epoch=args.min_epoch,
+ max_epoch=args.max_epoch)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/paddlespeech/dataset/s2t/build_vocab.py b/paddlespeech/dataset/s2t/build_vocab.py
new file mode 100755
index 00000000..081edf3d
--- /dev/null
+++ b/paddlespeech/dataset/s2t/build_vocab.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Build vocabulary from manifest files.
+Each item in vocabulary file is a character.
+"""
+import argparse
+import functools
+import os
+import tempfile
+from collections import Counter
+
+import jsonlines
+
+from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
+from paddlespeech.s2t.frontend.utility import BLANK
+from paddlespeech.s2t.frontend.utility import SOS
+from paddlespeech.s2t.frontend.utility import SPACE
+from paddlespeech.s2t.frontend.utility import UNK
+from paddlespeech.utils.argparse import add_arguments
+from paddlespeech.utils.argparse import print_arguments
+
+
+def count_manifest(counter, text_feature, manifest_path):
+ manifest_jsons = []
+ with jsonlines.open(manifest_path, 'r') as reader:
+ for json_data in reader:
+ manifest_jsons.append(json_data)
+
+ for line_json in manifest_jsons:
+ if isinstance(line_json['text'], str):
+ tokens = text_feature.tokenize(
+ line_json['text'], replace_space=False)
+
+ counter.update(tokens)
+ else:
+ assert isinstance(line_json['text'], list)
+ for text in line_json['text']:
+ tokens = text_feature.tokenize(text, replace_space=False)
+ counter.update(tokens)
+
+
+def dump_text_manifest(fileobj, manifest_path, key='text'):
+ manifest_jsons = []
+ with jsonlines.open(manifest_path, 'r') as reader:
+ for json_data in reader:
+ manifest_jsons.append(json_data)
+
+ for line_json in manifest_jsons:
+ if isinstance(line_json[key], str):
+ fileobj.write(line_json[key] + "\n")
+ else:
+ assert isinstance(line_json[key], list)
+ for line in line_json[key]:
+ fileobj.write(line + "\n")
+
+
+def build_vocab(manifest_paths="",
+ vocab_path="examples/librispeech/data/vocab.txt",
+ unit_type="char",
+ count_threshold=0,
+ text_keys='text',
+ spm_mode="unigram",
+ spm_vocab_size=0,
+ spm_model_prefix="",
+ spm_character_coverage=0.9995):
+ manifest_paths = [manifest_paths] if isinstance(manifest_paths,
+ str) else manifest_paths
+
+ fout = open(vocab_path, 'w', encoding='utf-8')
+ fout.write(BLANK + "\n") # 0 will be used for "blank" in CTC
+ fout.write(UNK + '\n') # must be 1
+
+ if unit_type == 'spm':
+ # tools/spm_train --input=$wave_data/lang_char/input.txt
+ # --vocab_size=${nbpe} --model_type=${bpemode}
+ # --model_prefix=${bpemodel} --input_sentence_size=100000000
+ import sentencepiece as spm
+
+ fp = tempfile.NamedTemporaryFile(mode='w', delete=False)
+ for manifest_path in manifest_paths:
+ _text_keys = [text_keys] if type(
+ text_keys) is not list else text_keys
+ for text_key in _text_keys:
+ dump_text_manifest(fp, manifest_path, key=text_key)
+ fp.close()
+ # train
+ spm.SentencePieceTrainer.Train(
+ input=fp.name,
+ vocab_size=spm_vocab_size,
+ model_type=spm_mode,
+ model_prefix=spm_model_prefix,
+ input_sentence_size=100000000,
+ character_coverage=spm_character_coverage)
+ os.unlink(fp.name)
+
+ # encode
+ text_feature = TextFeaturizer(unit_type, "", spm_model_prefix)
+ counter = Counter()
+
+ for manifest_path in manifest_paths:
+ count_manifest(counter, text_feature, manifest_path)
+
+ count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
+ tokens = []
+ for token, count in count_sorted:
+ if count < count_threshold:
+ break
+        # replace a literal space with the SPACE token
+ token = SPACE if token == ' ' else token
+ tokens.append(token)
+
+ tokens = sorted(tokens)
+ for token in tokens:
+ fout.write(token + '\n')
+
+    fout.write(SOS + "\n")  # <sos/eos>
+ fout.close()
+
+
+def define_argparse():
+ parser = argparse.ArgumentParser(description=__doc__)
+ add_arg = functools.partial(add_arguments, argparser=parser)
+
+ # yapf: disable
+ add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
+ add_arg('count_threshold', int, 0,
+            "Truncation threshold for char/word counts. Default 0: no truncation.")
+ add_arg('vocab_path', str,
+ 'examples/librispeech/data/vocab.txt',
+ "Filepath to write the vocabulary.")
+ add_arg('manifest_paths', str,
+ None,
+ "Filepaths of manifests for building vocabulary. "
+ "You can provide multiple manifest files.",
+ nargs='+',
+ required=True)
+ add_arg('text_keys', str,
+ 'text',
+ "keys of the text in manifest for building vocabulary. "
+            "You can provide multiple keys.",
+ nargs='+')
+ # bpe
+ add_arg('spm_vocab_size', int, 0, "Vocab size for spm.")
+    add_arg('spm_mode', str, 'unigram', "spm model type, e.g. unigram, bpe, char, word. Only needed when `unit_type` is spm.")
+    add_arg('spm_model_prefix', str, "", "spm model prefix, e.g. spm_model_%(spm_mode)_%(count_threshold). Only needed when `unit_type` is spm.")
+ add_arg('spm_character_coverage', float, 0.9995, "character coverage to determine the minimum symbols")
+ # yapf: disable
+
+ args = parser.parse_args()
+ return args
+
+def main():
+ args = define_argparse()
+ print_arguments(args, globals())
+ build_vocab(**vars(args))
+
+if __name__ == '__main__':
+ main()
diff --git a/paddlespeech/dataset/s2t/compute_mean_std.py b/paddlespeech/dataset/s2t/compute_mean_std.py
new file mode 100755
index 00000000..8762ee57
--- /dev/null
+++ b/paddlespeech/dataset/s2t/compute_mean_std.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Compute mean and std for feature normalizer, and save to file."""
+import argparse
+import functools
+
+from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline
+from paddlespeech.s2t.frontend.featurizer.audio_featurizer import AudioFeaturizer
+from paddlespeech.s2t.frontend.normalizer import FeatureNormalizer
+from paddlespeech.utils.argparse import add_arguments
+from paddlespeech.utils.argparse import print_arguments
+
+
+def compute_cmvn(manifest_path="data/librispeech/manifest.train",
+ output_path="data/librispeech/mean_std.npz",
+ num_samples=2000,
+ num_workers=0,
+ spectrum_type="linear",
+ feat_dim=13,
+ delta_delta=False,
+ stride_ms=10,
+ window_ms=20,
+ sample_rate=16000,
+ use_dB_normalization=True,
+ target_dB=-20):
+
+ augmentation_pipeline = AugmentationPipeline('{}')
+ audio_featurizer = AudioFeaturizer(
+ spectrum_type=spectrum_type,
+ feat_dim=feat_dim,
+ delta_delta=delta_delta,
+ stride_ms=float(stride_ms),
+ window_ms=float(window_ms),
+ n_fft=None,
+ max_freq=None,
+ target_sample_rate=sample_rate,
+ use_dB_normalization=use_dB_normalization,
+ target_dB=target_dB,
+ dither=0.0)
+
+ def augment_and_featurize(audio_segment):
+ augmentation_pipeline.transform_audio(audio_segment)
+ return audio_featurizer.featurize(audio_segment)
+
+ normalizer = FeatureNormalizer(
+ mean_std_filepath=None,
+ manifest_path=manifest_path,
+ featurize_func=augment_and_featurize,
+ num_samples=num_samples,
+ num_workers=num_workers)
+ normalizer.write_to_file(output_path)
+
+
+def define_argparse():
+ parser = argparse.ArgumentParser(description=__doc__)
+ add_arg = functools.partial(add_arguments, argparser=parser)
+
+ # yapf: disable
+ add_arg('manifest_path', str,
+ 'data/librispeech/manifest.train',
+ "Filepath of manifest to compute normalizer's mean and stddev.")
+
+ add_arg('output_path', str,
+ 'data/librispeech/mean_std.npz',
+            "Filepath to write the mean and stddev to (.npz).")
+    add_arg('num_samples', int, 2000, "Number of samples used to compute the statistics.")
+ add_arg('num_workers',
+ default=0,
+ type=int,
+ help='num of subprocess workers for processing')
+
+
+ add_arg('spectrum_type', str,
+ 'linear',
+ "Audio feature type. Options: linear, mfcc, fbank.",
+ choices=['linear', 'mfcc', 'fbank'])
+ add_arg('feat_dim', int, 13, "Audio feature dim.")
+ add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
+ add_arg('stride_ms', int, 10, "stride length in ms.")
+    add_arg('window_ms', int, 20, "window length in ms.")
+ add_arg('sample_rate', int, 16000, "target sample rate.")
+ add_arg('use_dB_normalization', bool, True, "do dB normalization.")
+ add_arg('target_dB', int, -20, "target dB.")
+ # yapf: disable
+
+ args = parser.parse_args()
+ return args
+
+def main():
+ args = define_argparse()
+ print_arguments(args, globals())
+ compute_cmvn(**vars(args))
+
+if __name__ == '__main__':
+ main()
diff --git a/paddlespeech/dataset/s2t/compute_wer.py b/paddlespeech/dataset/s2t/compute_wer.py
new file mode 100755
index 00000000..5711c725
--- /dev/null
+++ b/paddlespeech/dataset/s2t/compute_wer.py
@@ -0,0 +1,558 @@
+# Copyright 2021 Mobvoi Inc. All Rights Reserved.
+# flake8: noqa
+import codecs
+import re
+import sys
+import unicodedata
+
+remove_tag = True
+spacelist = [' ', '\t', '\r', '\n']
+puncts = [
+ '!', ',', '?', '、', '。', '!', ',', ';', '?', ':', '「', '」', '︰', '『', '』',
+ '《', '》'
+]
+
+
+def characterize(string):
+ res = []
+ i = 0
+ while i < len(string):
+ char = string[i]
+ if char in puncts:
+ i += 1
+ continue
+ cat1 = unicodedata.category(char)
+ #https://unicodebook.readthedocs.io/unicode.html#unicode-categories
+ if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned
+ i += 1
+ continue
+ if cat1 == 'Lo': # letter-other
+ res.append(char)
+ i += 1
+ else:
+            # some input looks like "<unk><noise>"; we want to separate it into two tokens
+ sep = ' '
+ if char == '<': sep = '>'
+ j = i + 1
+ while j < len(string):
+ c = string[j]
+ if ord(c) >= 128 or (c in spacelist) or (c == sep):
+ break
+ j += 1
+ if j < len(string) and string[j] == '>':
+ j += 1
+ res.append(string[i:j])
+ i = j
+ return res
+
+
+def stripoff_tags(x):
+ if not x: return ''
+ chars = []
+ i = 0
+ T = len(x)
+ while i < T:
+ if x[i] == '<':
+ while i < T and x[i] != '>':
+ i += 1
+ i += 1
+ else:
+ chars.append(x[i])
+ i += 1
+ return ''.join(chars)
+
+
+def normalize(sentence, ignore_words, cs, split=None):
+ """ sentence, ignore_words are both in unicode
+ """
+ new_sentence = []
+ for token in sentence:
+ x = token
+ if not cs:
+ x = x.upper()
+ if x in ignore_words:
+ continue
+ if remove_tag:
+ x = stripoff_tags(x)
+ if not x:
+ continue
+ if split and x in split:
+ new_sentence += split[x]
+ else:
+ new_sentence.append(x)
+ return new_sentence
+
+
+class Calculator:
+ def __init__(self):
+ self.data = {}
+ self.space = []
+ self.cost = {}
+ self.cost['cor'] = 0
+ self.cost['sub'] = 1
+ self.cost['del'] = 1
+ self.cost['ins'] = 1
+
+ def calculate(self, lab, rec):
+ # Initialization
+ lab.insert(0, '')
+ rec.insert(0, '')
+ while len(self.space) < len(lab):
+ self.space.append([])
+ for row in self.space:
+ for element in row:
+ element['dist'] = 0
+ element['error'] = 'non'
+ while len(row) < len(rec):
+ row.append({'dist': 0, 'error': 'non'})
+ for i in range(len(lab)):
+ self.space[i][0]['dist'] = i
+ self.space[i][0]['error'] = 'del'
+ for j in range(len(rec)):
+ self.space[0][j]['dist'] = j
+ self.space[0][j]['error'] = 'ins'
+ self.space[0][0]['error'] = 'non'
+ for token in lab:
+ if token not in self.data and len(token) > 0:
+ self.data[token] = {
+ 'all': 0,
+ 'cor': 0,
+ 'sub': 0,
+ 'ins': 0,
+ 'del': 0
+ }
+ for token in rec:
+ if token not in self.data and len(token) > 0:
+ self.data[token] = {
+ 'all': 0,
+ 'cor': 0,
+ 'sub': 0,
+ 'ins': 0,
+ 'del': 0
+ }
+ # Computing edit distance
+ for i, lab_token in enumerate(lab):
+ for j, rec_token in enumerate(rec):
+ if i == 0 or j == 0:
+ continue
+ min_dist = sys.maxsize
+ min_error = 'none'
+ dist = self.space[i - 1][j]['dist'] + self.cost['del']
+ error = 'del'
+ if dist < min_dist:
+ min_dist = dist
+ min_error = error
+ dist = self.space[i][j - 1]['dist'] + self.cost['ins']
+ error = 'ins'
+ if dist < min_dist:
+ min_dist = dist
+ min_error = error
+ if lab_token == rec_token:
+ dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor']
+ error = 'cor'
+ else:
+ dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub']
+ error = 'sub'
+ if dist < min_dist:
+ min_dist = dist
+ min_error = error
+ self.space[i][j]['dist'] = min_dist
+ self.space[i][j]['error'] = min_error
+ # Tracing back
+ result = {
+ 'lab': [],
+ 'rec': [],
+ 'all': 0,
+ 'cor': 0,
+ 'sub': 0,
+ 'ins': 0,
+ 'del': 0
+ }
+ i = len(lab) - 1
+ j = len(rec) - 1
+ while True:
+ if self.space[i][j]['error'] == 'cor': # correct
+ if len(lab[i]) > 0:
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
+ self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1
+ result['all'] = result['all'] + 1
+ result['cor'] = result['cor'] + 1
+ result['lab'].insert(0, lab[i])
+ result['rec'].insert(0, rec[j])
+ i = i - 1
+ j = j - 1
+ elif self.space[i][j]['error'] == 'sub': # substitution
+ if len(lab[i]) > 0:
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
+ self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1
+ result['all'] = result['all'] + 1
+ result['sub'] = result['sub'] + 1
+ result['lab'].insert(0, lab[i])
+ result['rec'].insert(0, rec[j])
+ i = i - 1
+ j = j - 1
+ elif self.space[i][j]['error'] == 'del': # deletion
+ if len(lab[i]) > 0:
+ self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
+ self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1
+ result['all'] = result['all'] + 1
+ result['del'] = result['del'] + 1
+ result['lab'].insert(0, lab[i])
+ result['rec'].insert(0, "")
+ i = i - 1
+ elif self.space[i][j]['error'] == 'ins': # insertion
+ if len(rec[j]) > 0:
+ self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1
+ result['ins'] = result['ins'] + 1
+ result['lab'].insert(0, "")
+ result['rec'].insert(0, rec[j])
+ j = j - 1
+ elif self.space[i][j]['error'] == 'non': # starting point
+ break
+ else: # shouldn't reach here
+ print(
+ 'this should not happen , i = {i} , j = {j} , error = {error}'.
+ format(i=i, j=j, error=self.space[i][j]['error']))
+ return result
+
+ def overall(self):
+ result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
+ for token in self.data:
+ result['all'] = result['all'] + self.data[token]['all']
+ result['cor'] = result['cor'] + self.data[token]['cor']
+ result['sub'] = result['sub'] + self.data[token]['sub']
+ result['ins'] = result['ins'] + self.data[token]['ins']
+ result['del'] = result['del'] + self.data[token]['del']
+ return result
+
+ def cluster(self, data):
+ result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
+ for token in data:
+ if token in self.data:
+ result['all'] = result['all'] + self.data[token]['all']
+ result['cor'] = result['cor'] + self.data[token]['cor']
+ result['sub'] = result['sub'] + self.data[token]['sub']
+ result['ins'] = result['ins'] + self.data[token]['ins']
+ result['del'] = result['del'] + self.data[token]['del']
+ return result
+
+ def keys(self):
+ return list(self.data.keys())
+
+
+def width(string):
+ return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
+
+
+def default_cluster(word):
+ unicode_names = [unicodedata.name(char) for char in word]
+ for i in reversed(range(len(unicode_names))):
+ if unicode_names[i].startswith('DIGIT'): # 1
+ unicode_names[i] = 'Number' # 'DIGIT'
+ elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or
+ unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')):
+ # 明 / 郎
+ unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH'
+ elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or
+ unicode_names[i].startswith('LATIN SMALL LETTER')):
+ # A / a
+ unicode_names[i] = 'English' # 'LATIN LETTER'
+ elif unicode_names[i].startswith('HIRAGANA LETTER'): # は こ め
+ unicode_names[i] = 'Japanese' # 'GANA LETTER'
+ elif (unicode_names[i].startswith('AMPERSAND') or
+ unicode_names[i].startswith('APOSTROPHE') or
+ unicode_names[i].startswith('COMMERCIAL AT') or
+ unicode_names[i].startswith('DEGREE CELSIUS') or
+ unicode_names[i].startswith('EQUALS SIGN') or
+ unicode_names[i].startswith('FULL STOP') or
+ unicode_names[i].startswith('HYPHEN-MINUS') or
+ unicode_names[i].startswith('LOW LINE') or
+ unicode_names[i].startswith('NUMBER SIGN') or
+ unicode_names[i].startswith('PLUS SIGN') or
+ unicode_names[i].startswith('SEMICOLON')):
+ # & / ' / @ / ℃ / = / . / - / _ / # / + / ;
+ del unicode_names[i]
+ else:
+ return 'Other'
+ if len(unicode_names) == 0:
+ return 'Other'
+ if len(unicode_names) == 1:
+ return unicode_names[0]
+ for i in range(len(unicode_names) - 1):
+ if unicode_names[i] != unicode_names[i + 1]:
+ return 'Other'
+ return unicode_names[0]
+
+
+def usage():
+ print(
+ "compute-wer.py : compute word error rate (WER) and align recognition results and references."
+ )
+ print(
+ " usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer"
+ )
+
+
+def main():
+    global remove_tag  # updated by the --rt= switch below and read by normalize()
+    # python utils/compute-wer.py --char=1 --v=1 ref hyp > rsl.error
+ if len(sys.argv) == 1:
+ usage()
+ sys.exit(0)
+ calculator = Calculator()
+ cluster_file = ''
+ ignore_words = set()
+ tochar = False
+ verbose = 1
+ padding_symbol = ' '
+ case_sensitive = False
+ max_words_per_line = sys.maxsize
+ split = None
+ while len(sys.argv) > 3:
+ a = '--maxw='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):]
+ del sys.argv[1]
+ max_words_per_line = int(b)
+ continue
+ a = '--rt='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ remove_tag = (b == 'true') or (b != '0')
+ continue
+ a = '--cs='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ case_sensitive = (b == 'true') or (b != '0')
+ continue
+ a = '--cluster='
+ if sys.argv[1].startswith(a):
+ cluster_file = sys.argv[1][len(a):]
+ del sys.argv[1]
+ continue
+ a = '--splitfile='
+ if sys.argv[1].startswith(a):
+ split_file = sys.argv[1][len(a):]
+ del sys.argv[1]
+ split = dict()
+ with codecs.open(split_file, 'r', 'utf-8') as fh:
+ for line in fh: # line in unicode
+ words = line.strip().split()
+ if len(words) >= 2:
+ split[words[0]] = words[1:]
+ continue
+ a = '--ig='
+ if sys.argv[1].startswith(a):
+ ignore_file = sys.argv[1][len(a):]
+ del sys.argv[1]
+ with codecs.open(ignore_file, 'r', 'utf-8') as fh:
+ for line in fh: # line in unicode
+ line = line.strip()
+ if len(line) > 0:
+ ignore_words.add(line)
+ continue
+ a = '--char='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ tochar = (b == 'true') or (b != '0')
+ continue
+ a = '--v='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ verbose = 0
+ try:
+ verbose = int(b)
+ except:
+ if b == 'true' or b != '0':
+ verbose = 1
+ continue
+ a = '--padding-symbol='
+ if sys.argv[1].startswith(a):
+ b = sys.argv[1][len(a):].lower()
+ del sys.argv[1]
+ if b == 'space':
+ padding_symbol = ' '
+ elif b == 'underline':
+ padding_symbol = '_'
+ continue
+ if True or sys.argv[1].startswith('-'):
+ #ignore invalid switch
+ del sys.argv[1]
+ continue
+
+ if not case_sensitive:
+ ig = set([w.upper() for w in ignore_words])
+ ignore_words = ig
+
+ default_clusters = {}
+ default_words = {}
+
+ ref_file = sys.argv[1]
+ hyp_file = sys.argv[2]
+ rec_set = {}
+ if split and not case_sensitive:
+ newsplit = dict()
+ for w in split:
+ words = split[w]
+ for i in range(len(words)):
+ words[i] = words[i].upper()
+ newsplit[w.upper()] = words
+ split = newsplit
+
+ with codecs.open(hyp_file, 'r', 'utf-8') as fh:
+ for line in fh:
+ if tochar:
+ array = characterize(line)
+ else:
+ array = line.strip().split()
+ if len(array) == 0: continue
+ fid = array[0]
+ rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive,
+ split)
+
+ # compute error rate on the interaction of reference file and hyp file
+ for line in open(ref_file, 'r', encoding='utf-8'):
+ if tochar:
+ array = characterize(line)
+ else:
+ array = line.rstrip('\n').split()
+ if len(array) == 0: continue
+ fid = array[0]
+ if fid not in rec_set:
+ continue
+ lab = normalize(array[1:], ignore_words, case_sensitive, split)
+ rec = rec_set[fid]
+ if verbose:
+ print('\nutt: %s' % fid)
+
+ for word in rec + lab:
+ if word not in default_words:
+ default_cluster_name = default_cluster(word)
+ if default_cluster_name not in default_clusters:
+ default_clusters[default_cluster_name] = {}
+ if word not in default_clusters[default_cluster_name]:
+ default_clusters[default_cluster_name][word] = 1
+ default_words[word] = default_cluster_name
+
+ result = calculator.calculate(lab, rec)
+ if verbose:
+ if result['all'] != 0:
+ wer = float(result['ins'] + result['sub'] + result[
+ 'del']) * 100.0 / result['all']
+ else:
+ wer = 0.0
+ print('WER: %4.2f %%' % wer, end=' ')
+ print('N=%d C=%d S=%d D=%d I=%d' %
+ (result['all'], result['cor'], result['sub'], result['del'],
+ result['ins']))
+ space = {}
+ space['lab'] = []
+ space['rec'] = []
+ for idx in range(len(result['lab'])):
+ len_lab = width(result['lab'][idx])
+ len_rec = width(result['rec'][idx])
+ length = max(len_lab, len_rec)
+ space['lab'].append(length - len_lab)
+ space['rec'].append(length - len_rec)
+ upper_lab = len(result['lab'])
+ upper_rec = len(result['rec'])
+ lab1, rec1 = 0, 0
+ while lab1 < upper_lab or rec1 < upper_rec:
+ if verbose > 1:
+ print('lab(%s):' % fid.encode('utf-8'), end=' ')
+ else:
+ print('lab:', end=' ')
+ lab2 = min(upper_lab, lab1 + max_words_per_line)
+ for idx in range(lab1, lab2):
+ token = result['lab'][idx]
+ print('{token}'.format(token=token), end='')
+ for n in range(space['lab'][idx]):
+ print(padding_symbol, end='')
+ print(' ', end='')
+ print()
+ if verbose > 1:
+ print('rec(%s):' % fid.encode('utf-8'), end=' ')
+ else:
+ print('rec:', end=' ')
+ rec2 = min(upper_rec, rec1 + max_words_per_line)
+ for idx in range(rec1, rec2):
+ token = result['rec'][idx]
+ print('{token}'.format(token=token), end='')
+ for n in range(space['rec'][idx]):
+ print(padding_symbol, end='')
+ print(' ', end='')
+ print('\n', end='\n')
+ lab1 = lab2
+ rec1 = rec2
+
+ if verbose:
+ print(
+ '==========================================================================='
+ )
+ print()
+
+ result = calculator.overall()
+ if result['all'] != 0:
+ wer = float(result['ins'] + result['sub'] + result[
+ 'del']) * 100.0 / result['all']
+ else:
+ wer = 0.0
+ print('Overall -> %4.2f %%' % wer, end=' ')
+ print('N=%d C=%d S=%d D=%d I=%d' %
+ (result['all'], result['cor'], result['sub'], result['del'],
+ result['ins']))
+ if not verbose:
+ print()
+
+ if verbose:
+ for cluster_id in default_clusters:
+ result = calculator.cluster(
+ [k for k in default_clusters[cluster_id]])
+ if result['all'] != 0:
+ wer = float(result['ins'] + result['sub'] + result[
+ 'del']) * 100.0 / result['all']
+ else:
+ wer = 0.0
+ print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
+ print('N=%d C=%d S=%d D=%d I=%d' %
+ (result['all'], result['cor'], result['sub'], result['del'],
+ result['ins']))
+ if len(cluster_file) > 0: # compute separated WERs for word clusters
+ cluster_id = ''
+ cluster = []
+ for line in open(cluster_file, 'r', encoding='utf-8'):
+            for token in line.rstrip('\n').split():
+                # end of cluster reached, like </Keyword>
+                if token[0:2] == '</' and token[len(token)-1] == '>' and \
+                        token.lstrip('</').rstrip('>') == cluster_id :
+ result = calculator.cluster(cluster)
+ if result['all'] != 0:
+ wer = float(result['ins'] + result['sub'] + result[
+ 'del']) * 100.0 / result['all']
+ else:
+ wer = 0.0
+ print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
+ print('N=%d C=%d S=%d D=%d I=%d' %
+ (result['all'], result['cor'], result['sub'],
+ result['del'], result['ins']))
+ cluster_id = ''
+ cluster = []
+                # beginning of cluster reached, like <Keyword>
+ elif token[0] == '<' and token[len(token)-1] == '>' and \
+ cluster_id == '' :
+ cluster_id = token.lstrip('<').rstrip('>')
+ cluster = []
+ # general terms, like WEATHER / CAR / ...
+ else:
+ cluster.append(token)
+ print()
+ print(
+ '==========================================================================='
+ )
+
+
+if __name__ == '__main__':
+ main()
diff --git a/paddlespeech/dataset/s2t/format_data.py b/paddlespeech/dataset/s2t/format_data.py
new file mode 100755
index 00000000..addd6fdc
--- /dev/null
+++ b/paddlespeech/dataset/s2t/format_data.py
@@ -0,0 +1,155 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""format manifest with more metadata."""
+import argparse
+import functools
+import json
+
+import jsonlines
+
+from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
+from paddlespeech.s2t.frontend.utility import load_cmvn
+from paddlespeech.s2t.io.utility import feat_type
+from paddlespeech.utils.argparse import add_arguments
+from paddlespeech.utils.argparse import print_arguments
+
+
+def define_argparse():
+ parser = argparse.ArgumentParser(description=__doc__)
+ add_arg = functools.partial(add_arguments, argparser=parser)
+ # yapf: disable
+ add_arg('manifest_paths', str,
+ None,
+            "Filepaths of manifests to format. "
+ "You can provide multiple manifest files.",
+ nargs='+',
+ required=True)
+    add_arg('output_path', str, None, "Filepath of the formatted manifest.", required=True)
+ add_arg('cmvn_path', str,
+ 'examples/librispeech/data/mean_std.json',
+ "Filepath of cmvn.")
+ add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
+ add_arg('vocab_path', str,
+ 'examples/librispeech/data/vocab.txt',
+ "Filepath of the vocabulary.")
+ # bpe
+ add_arg('spm_model_prefix', str, None,
+            "spm model prefix, e.g. spm_model_%(bpe_mode)_%(count_threshold). Only needed when `unit_type` is spm.")
+
+ # yapf: disable
+ args = parser.parse_args()
+ return args
+
+def format_data(
+ manifest_paths="",
+ output_path="",
+ cmvn_path="examples/librispeech/data/mean_std.json",
+ unit_type="char",
+ vocab_path="examples/librispeech/data/vocab.txt",
+ spm_model_prefix=""):
+ manifest_paths = [manifest_paths] if isinstance(manifest_paths, str) else manifest_paths
+
+ fout = open(output_path, 'w', encoding='utf-8')
+
+ # get feat dim
+ filetype = cmvn_path.split(".")[-1]
+ mean, istd = load_cmvn(cmvn_path, filetype=filetype)
+ feat_dim = mean.shape[0] #(D)
+ print(f"Feature dim: {feat_dim}")
+
+ text_feature = TextFeaturizer(unit_type, vocab_path, spm_model_prefix)
+ vocab_size = text_feature.vocab_size
+ print(f"Vocab size: {vocab_size}")
+
+    # each output json line looks like this
+ # {
+ # "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
+ # "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
+ # "utt2spk": "111-2222",
+ # "utt": "111-2222-333"
+ # }
+ count = 0
+ for manifest_path in manifest_paths:
+ with jsonlines.open(str(manifest_path), 'r') as reader:
+ manifest_jsons = list(reader)
+
+ for line_json in manifest_jsons:
+ output_json = {
+ "input": [],
+ "output": [],
+ 'utt': line_json['utt'],
+ 'utt2spk': line_json.get('utt2spk', 'global'),
+ }
+
+ # output
+ line = line_json['text']
+ if isinstance(line, str):
+ # only one target
+ tokens = text_feature.tokenize(line)
+ tokenids = text_feature.featurize(line)
+ output_json['output'].append({
+ 'name': 'target1',
+ 'shape': (len(tokenids), vocab_size),
+ 'text': line,
+ 'token': ' '.join(tokens),
+ 'tokenid': ' '.join(map(str, tokenids)),
+ })
+ else:
+ # isinstance(line, list), multi target in one vocab
+ for i, item in enumerate(line, 1):
+ tokens = text_feature.tokenize(item)
+ tokenids = text_feature.featurize(item)
+ output_json['output'].append({
+ 'name': f'target{i}',
+ 'shape': (len(tokenids), vocab_size),
+ 'text': item,
+ 'token': ' '.join(tokens),
+ 'tokenid': ' '.join(map(str, tokenids)),
+ })
+
+ # input
+ line = line_json['feat']
+ if isinstance(line, str):
+ # only one input
+ feat_shape = line_json['feat_shape']
+ assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
+ filetype = feat_type(line)
+ if filetype == 'sound':
+ feat_shape.append(feat_dim)
+ else: # kaldi
+ raise NotImplementedError('no support kaldi feat now!')
+
+ output_json['input'].append({
+ "name": "input1",
+ "shape": feat_shape,
+ "feat": line,
+ "filetype": filetype,
+ })
+ else:
+ # isinstance(line, list), multi input
+ raise NotImplementedError("not support multi input now!")
+
+ fout.write(json.dumps(output_json) + '\n')
+ count += 1
+
+ print(f"{manifest_paths} Examples number: {count}")
+ fout.close()
+
+def main():
+ args = define_argparse()
+ print_arguments(args, globals())
+ format_data(**vars(args))
+
+if __name__ == '__main__':
+ main()
diff --git a/paddlespeech/dataset/s2t/format_rsl.py b/paddlespeech/dataset/s2t/format_rsl.py
new file mode 100644
index 00000000..0a58e7e6
--- /dev/null
+++ b/paddlespeech/dataset/s2t/format_rsl.py
@@ -0,0 +1,143 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Format ref/hyp files into the `utt text` format to compute CER/WER/MER.
+
+norm:
+BAC009S0764W0196 明确了发展目标和重点任务
+BAC009S0764W0186 实现我国房地产市场的平稳运行
+
+
+sclite:
+加大对结构机械化环境和收集谈控机制力度(BAC009S0906W0240.wav)
+河南省新乡市丰秋县刘光镇政府东五零左右(BAC009S0770W0441.wav)
+"""
+import argparse
+
+import jsonlines
+
+from paddlespeech.utils.argparse import print_arguments
+
+
+def transform_hyp(origin, trans, trans_sclite):
+ """
+ Args:
+ origin: The input json file which contains the model output
+        trans: The output file for calculating CER/WER
+        trans_sclite: The output file for calculating CER/WER using sclite
+ """
+ input_dict = {}
+
+ with open(origin, "r+", encoding="utf8") as f:
+ for item in jsonlines.Reader(f):
+ input_dict[item["utt"]] = item["hyps"][0]
+
+ if trans:
+ with open(trans, "w+", encoding="utf8") as f:
+ for key in input_dict.keys():
+ f.write(key + " " + input_dict[key] + "\n")
+ print(f"transform_hyp output: {trans}")
+
+ if trans_sclite:
+ with open(trans_sclite, "w+") as f:
+ for key in input_dict.keys():
+ line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
+ f.write(line)
+ print(f"transform_hyp output: {trans_sclite}")
+
+
+def transform_ref(origin, trans, trans_sclite):
+ """
+ Args:
+        origin: The input json file which contains the reference text
+        trans: The output file for calculating CER/WER
+        trans_sclite: The output file for calculating CER/WER using sclite
+ """
+ input_dict = {}
+
+ with open(origin, "r", encoding="utf8") as f:
+ for item in jsonlines.Reader(f):
+ input_dict[item["utt"]] = item["text"]
+
+ if trans:
+ with open(trans, "w", encoding="utf8") as f:
+ for key in input_dict.keys():
+ f.write(key + " " + input_dict[key] + "\n")
+            print(f"transform_ref output: {trans}")
+
+ if trans_sclite:
+ with open(trans_sclite, "w") as f:
+ for key in input_dict.keys():
+ line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
+ f.write(line)
+            print(f"transform_ref output: {trans_sclite}")
+
+
+def define_argparse():
+ parser = argparse.ArgumentParser(
+ prog='format ref/hyp file for compute CER/WER', add_help=True)
+ parser.add_argument(
+ '--origin_hyp', type=str, default="", help='origin hyp file')
+ parser.add_argument(
+ '--trans_hyp',
+ type=str,
+ default="",
+        help='hyp file for calculating CER/WER')
+ parser.add_argument(
+ '--trans_hyp_sclite',
+ type=str,
+ default="",
+        help='hyp file for calculating CER/WER by sclite')
+
+ parser.add_argument(
+ '--origin_ref', type=str, default="", help='origin ref file')
+ parser.add_argument(
+ '--trans_ref',
+ type=str,
+ default="",
+        help='ref file for calculating CER/WER')
+ parser.add_argument(
+ '--trans_ref_sclite',
+ type=str,
+ default="",
+        help='ref file for calculating CER/WER by sclite')
+ parser_args = parser.parse_args()
+ return parser_args
+
+
+def format_result(origin_hyp="",
+ trans_hyp="",
+ trans_hyp_sclite="",
+ origin_ref="",
+ trans_ref="",
+ trans_ref_sclite=""):
+
+ if origin_hyp:
+ transform_hyp(
+ origin=origin_hyp, trans=trans_hyp, trans_sclite=trans_hyp_sclite)
+
+ if origin_ref:
+ transform_ref(
+ origin=origin_ref, trans=trans_ref, trans_sclite=trans_ref_sclite)
+
+
+def main():
+ args = define_argparse()
+ print_arguments(args, globals())
+
+ format_result(**vars(args))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/resource/model_alias.py b/paddlespeech/resource/model_alias.py
index ab0b1828..6bf9b588 100644
--- a/paddlespeech/resource/model_alias.py
+++ b/paddlespeech/resource/model_alias.py
@@ -23,6 +23,9 @@ model_alias = {
# ---------------------------------
"wav2vec2ASR": ["paddlespeech.s2t.models.wav2vec2:Wav2vec2ASR"],
"wav2vec2": ["paddlespeech.s2t.models.wav2vec2:Wav2vec2Base"],
+ "hubertASR": ["paddlespeech.s2t.models.hubert:HubertASR"],
+ "hubert": ["paddlespeech.s2t.models.hubert:HubertBase"],
+ "wavlmASR": ["paddlespeech.s2t.models.wavlm:WavLMASR"],
# ---------------------------------
# -------------- ASR --------------
diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py
index 3c5db64b..e539c001 100644
--- a/paddlespeech/resource/pretrained_models.py
+++ b/paddlespeech/resource/pretrained_models.py
@@ -117,6 +117,48 @@ ssl_dynamic_pretrained_models = {
'exp/wav2vec2ASR/checkpoints/avg_1.pdparams',
},
},
+ "hubert-en-16k": {
+ '1.4': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/hubert/hubert-large-lv60_ckpt_1.4.0.model.tar.gz',
+ 'md5':
+ 'efecfb87a8718aa9253b7459c1fe9b54',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'hubert-large-lv60',
+ 'model':
+ 'hubert-large-lv60.pdparams',
+ 'params':
+ 'hubert-large-lv60.pdparams',
+ },
+ },
+ "hubertASR_librispeech-100h-en-16k": {
+ '1.4': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/hubert/hubertASR-large-100h-librispeech_ckpt_1.4.0.model.tar.gz',
+ 'md5':
+ '574cefd11aaef5737969ce22a7f33ea2',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/hubertASR/checkpoints/avg_1',
+ 'model':
+ 'exp/hubertASR/checkpoints/avg_1.pdparams',
+ 'params':
+ 'exp/hubertASR/checkpoints/avg_1.pdparams',
+ },
+ },
+ "wavlmASR_librispeech-en-16k": {
+ "1.0": {
+ "url": "https://paddlespeech.bj.bcebos.com/wavlm/wavlm_baseplus_libriclean_100h.tar.gz",
+ "md5": "f2238e982bb8bcf046e536201f5ea629",
+ "cfg_path": "model.yaml",
+ "ckpt_path": "exp/wavlmASR/checkpoints/46",
+ "model": "exp/wavlmASR/checkpoints/46.pdparams",
+ "params": "exp/wavlmASR/checkpoints/46.pdparams",
+ }
+ }
}
# ---------------------------------
@@ -228,6 +270,16 @@ asr_dynamic_pretrained_models = {
'ckpt_path':
'exp/chunk_conformer/checkpoints/avg_30',
},
+ '1.4': {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.5.0.model.tar.gz',
+ 'md5':
+ '38924b8adc28ef458847c3571e87e3cb',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/chunk_conformer/checkpoints/avg_30',
+ },
},
"transformer_librispeech-en-16k": {
'1.0': {
diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py
index 6663bcf8..37d99226 100644
--- a/paddlespeech/s2t/__init__.py
+++ b/paddlespeech/s2t/__init__.py
@@ -267,7 +267,7 @@ def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'to'):
- logger.debug("register user to to paddle.Tensor, remove this when fixed!")
+ logger.debug("register user to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'to', to)
setattr(paddle.static.Variable, 'to', to)
diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py
index 5755a5f1..f6b1ed09 100644
--- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py
@@ -28,8 +28,8 @@ from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.socket_server import AsrRequestHandler
from paddlespeech.s2t.utils.socket_server import AsrTCPServer
from paddlespeech.s2t.utils.socket_server import warm_up_test
-from paddlespeech.s2t.utils.utility import add_arguments
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import add_arguments
+from paddlespeech.utils.argparse import print_arguments
def init_predictor(args):
diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py
index 0d0b4f21..fc57399d 100644
--- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py
@@ -26,8 +26,8 @@ from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.socket_server import AsrRequestHandler
from paddlespeech.s2t.utils.socket_server import AsrTCPServer
from paddlespeech.s2t.utils.socket_server import warm_up_test
-from paddlespeech.s2t.utils.utility import add_arguments
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import add_arguments
+from paddlespeech.utils.argparse import print_arguments
def start_server(config, args):
diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/export.py b/paddlespeech/s2t/exps/deepspeech2/bin/export.py
index 8acd46df..07228e98 100644
--- a/paddlespeech/s2t/exps/deepspeech2/bin/export.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/export.py
@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test.py b/paddlespeech/s2t/exps/deepspeech2/bin/test.py
index 030168a9..a8e20ff9 100644
--- a/paddlespeech/s2t/exps/deepspeech2/bin/test.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/test.py
@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py
index d7a9402b..1e07aa80 100644
--- a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py
@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py
index 66ea29d0..32a583b6 100644
--- a/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_wav.py
@@ -27,8 +27,8 @@ from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils import mp_tools
from paddlespeech.s2t.utils.checkpoint import Checkpoint
from paddlespeech.s2t.utils.log import Log
-from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.s2t.utils.utility import UpdateConfig
+from paddlespeech.utils.argparse import print_arguments
logger = Log(__name__).getlog()
diff --git a/paddlespeech/s2t/exps/deepspeech2/bin/train.py b/paddlespeech/s2t/exps/deepspeech2/bin/train.py
index 2c9942f9..1340aaa3 100644
--- a/paddlespeech/s2t/exps/deepspeech2/bin/train.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/train.py
@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
diff --git a/paddlespeech/s2t/exps/deepspeech2/model.py b/paddlespeech/s2t/exps/deepspeech2/model.py
index 7ab8cf85..d007a9e3 100644
--- a/paddlespeech/s2t/exps/deepspeech2/model.py
+++ b/paddlespeech/s2t/exps/deepspeech2/model.py
@@ -27,7 +27,6 @@ from paddlespeech.audio.text.text_featurizer import TextFeaturizer
from paddlespeech.s2t.io.dataloader import BatchDataLoader
from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
-from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
from paddlespeech.s2t.training.reporter import report
from paddlespeech.s2t.training.timer import Timer
from paddlespeech.s2t.training.trainer import Trainer
@@ -148,7 +147,7 @@ class DeepSpeech2Trainer(Trainer):
if not self.train:
return
- grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip)
+ grad_clip = paddle.nn.ClipGradByGlobalNorm(config.global_grad_clip)
lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
learning_rate=config.lr, gamma=config.lr_decay, verbose=True)
optimizer = paddle.optimizer.Adam(
diff --git a/paddlespeech/s2t/exps/hubert/__init__.py b/paddlespeech/s2t/exps/hubert/__init__.py
new file mode 100644
index 00000000..97043fd7
--- /dev/null
+++ b/paddlespeech/s2t/exps/hubert/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/s2t/exps/hubert/bin/__init__.py b/paddlespeech/s2t/exps/hubert/bin/__init__.py
new file mode 100644
index 00000000..595add0a
--- /dev/null
+++ b/paddlespeech/s2t/exps/hubert/bin/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/s2t/exps/hubert/bin/test.py b/paddlespeech/s2t/exps/hubert/bin/test.py
new file mode 100644
index 00000000..e0ad09f0
--- /dev/null
+++ b/paddlespeech/s2t/exps/hubert/bin/test.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Evaluation for hubert model."""
+import cProfile
+
+from yacs.config import CfgNode
+
+from paddlespeech.s2t.exps.hubert.model import HubertASRTester as Tester
+from paddlespeech.s2t.training.cli import default_argument_parser
+from paddlespeech.s2t.utils.utility import print_arguments
+
+
+def main_sp(config, args):
+ exp = Tester(config, args)
+ with exp.eval():
+ exp.setup()
+ exp.run_test()
+
+
+def main(config, args):
+ main_sp(config, args)
+
+
+if __name__ == "__main__":
+ parser = default_argument_parser()
+ # save asr result to result_file
+ parser.add_argument(
+ '--dict-path', type=str, default=None, help='dict path.')
+ parser.add_argument(
+ "--result_file", type=str, help="path of save the asr result")
+ args = parser.parse_args()
+ print_arguments(args, globals())
+
+ # https://yaml.org/type/float.html
+ config = CfgNode(new_allowed=True)
+ if args.config:
+ config.merge_from_file(args.config)
+ if args.decode_cfg:
+ decode_confs = CfgNode(new_allowed=True)
+ decode_confs.merge_from_file(args.decode_cfg)
+ config.decode = decode_confs
+ if args.opts:
+ config.merge_from_list(args.opts)
+ config.freeze()
+ print(config)
+ if args.dump_config:
+ with open(args.dump_config, 'w') as f:
+ print(config, file=f)
+
+ # Setting for profiling
+ pr = cProfile.Profile()
+ pr.runcall(main, config, args)
+ pr.dump_stats('test.profile')
diff --git a/paddlespeech/s2t/exps/hubert/bin/test_wav.py b/paddlespeech/s2t/exps/hubert/bin/test_wav.py
new file mode 100644
index 00000000..94d7f76a
--- /dev/null
+++ b/paddlespeech/s2t/exps/hubert/bin/test_wav.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Evaluation for hubert model."""
+import os
+import sys
+from pathlib import Path
+
+import paddle
+import soundfile
+from yacs.config import CfgNode
+
+from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
+from paddlespeech.s2t.models.hubert.hubert_ASR import HubertASR
+from paddlespeech.s2t.training.cli import default_argument_parser
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.s2t.utils.utility import UpdateConfig
+logger = Log(__name__).getlog()
+
+
+class HubertInfer():
+ def __init__(self, config, args):
+ self.args = args
+ self.config = config
+ self.audio_file = args.audio_file
+
+ self.text_feature = TextFeaturizer(
+ unit_type=config.unit_type, vocab=config.vocab_filepath)
+ paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')
+
+ # model
+ model_conf = config
+ with UpdateConfig(model_conf):
+ model_conf.output_dim = self.text_feature.vocab_size
+ model = HubertASR.from_config(model_conf)
+ self.model = model
+ self.model.eval()
+
+ # load model
+ params_path = self.args.checkpoint_path + ".pdparams"
+ model_dict = paddle.load(params_path)
+ self.model.set_state_dict(model_dict)
+
+ def run(self):
+ check(self.audio_file)
+
+ with paddle.no_grad():
+ # read
+ audio, _ = soundfile.read(
+ self.audio_file, dtype="int16", always_2d=True)
+ logger.info(f"audio shape: {audio.shape}")
+
+ xs = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)
+ decode_config = self.config.decode
+ result_transcripts, result_tokenids = self.model.decode(
+ xs,
+ text_feature=self.text_feature,
+ decoding_method=decode_config.decoding_method,
+ beam_size=decode_config.beam_size)
+ rsl = result_transcripts[0]
+ utt = Path(self.audio_file).name
+ logger.info(f"hyp: {utt} {rsl}")
+ return rsl
+
+
+def check(audio_file):
+ if not os.path.isfile(audio_file):
+ print("Please input the right audio file path")
+ sys.exit(-1)
+
+ logger.info("checking the audio file format......")
+ try:
+ sig, sample_rate = soundfile.read(audio_file)
+ except Exception as e:
+ logger.error(str(e))
+ logger.error(
+ "can not open the wav file, please check the audio file format")
+ sys.exit(-1)
+ logger.info("The sample rate is %d" % sample_rate)
+ assert (sample_rate == 16000)
+ logger.info("The audio file format is right")
+
+
+def main(config, args):
+ HubertInfer(config, args).run()
+
+
+if __name__ == "__main__":
+ parser = default_argument_parser()
+ # save asr result to result_file
+ parser.add_argument(
+ "--result_file", type=str, help="path of save the asr result")
+ parser.add_argument(
+ "--audio_file", type=str, help="path of the input audio file")
+ args = parser.parse_args()
+
+ config = CfgNode(new_allowed=True)
+
+ if args.config:
+ config.merge_from_file(args.config)
+ if args.decode_cfg:
+ decode_confs = CfgNode(new_allowed=True)
+ decode_confs.merge_from_file(args.decode_cfg)
+ config.decode = decode_confs
+ if args.opts:
+ config.merge_from_list(args.opts)
+ config.freeze()
+ main(config, args)
diff --git a/paddlespeech/s2t/exps/hubert/bin/train.py b/paddlespeech/s2t/exps/hubert/bin/train.py
new file mode 100644
index 00000000..b7c0a924
--- /dev/null
+++ b/paddlespeech/s2t/exps/hubert/bin/train.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Trainer for hubert model."""
+import cProfile
+import os
+
+from yacs.config import CfgNode
+
+from paddlespeech.s2t.exps.hubert.model import HubertASRTrainer as Trainer
+from paddlespeech.s2t.training.cli import default_argument_parser
+from paddlespeech.s2t.utils.utility import print_arguments
+
+
+def main_sp(config, args):
+ exp = Trainer(config, args)
+ exp.setup()
+ exp.run()
+
+
+def main(config, args):
+ main_sp(config, args)
+
+
+if __name__ == "__main__":
+ parser = default_argument_parser()
+ parser.add_argument(
+ '--resume', type=str, default="", nargs="?", help='resume ckpt path.')
+ args = parser.parse_args()
+ print_arguments(args, globals())
+ # https://yaml.org/type/float.html
+ config = CfgNode(new_allowed=True)
+ if args.config:
+ config.merge_from_file(args.config)
+ if args.opts:
+ config.merge_from_list(args.opts)
+ config.freeze()
+ if args.dump_config:
+ with open(args.dump_config, 'w') as f:
+ print(config, file=f)
+
+ # Setting for profiling
+ pr = cProfile.Profile()
+ pr.runcall(main, config, args)
+ pr.dump_stats(os.path.join(args.output, 'train.profile'))
diff --git a/paddlespeech/s2t/exps/hubert/model.py b/paddlespeech/s2t/exps/hubert/model.py
new file mode 100644
index 00000000..bc05921d
--- /dev/null
+++ b/paddlespeech/s2t/exps/hubert/model.py
@@ -0,0 +1,918 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains hubert model."""
+import json
+import math
+import os
+import re
+import time
+from collections import OrderedDict
+from contextlib import nullcontext
+
+import jsonlines
+import numpy as np
+import paddle
+from hyperpyyaml import load_hyperpyyaml
+from paddle import distributed as dist
+from paddlenlp.transformers import AutoTokenizer
+
+from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
+from paddlespeech.s2t.io.dataloader import DataLoaderFactory
+from paddlespeech.s2t.io.speechbrain import data_pipeline
+from paddlespeech.s2t.io.speechbrain import dataio
+from paddlespeech.s2t.io.speechbrain import dataset
+from paddlespeech.s2t.io.speechbrain.dataloader import make_dataloader
+from paddlespeech.s2t.models.hubert.hubert_ASR import HubertASR
+from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import TimeDomainSpecAugment
+from paddlespeech.s2t.training.optimizer import OptimizerFactory
+from paddlespeech.s2t.training.reporter import ObsScope
+from paddlespeech.s2t.training.reporter import report
+from paddlespeech.s2t.training.scheduler import LRSchedulerFactory
+from paddlespeech.s2t.training.timer import Timer
+from paddlespeech.s2t.training.trainer import Trainer
+from paddlespeech.s2t.utils import error_rate
+from paddlespeech.s2t.utils import layer_tools
+from paddlespeech.s2t.utils import mp_tools
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.s2t.utils.utility import UpdateConfig
+
+logger = Log(__name__).getlog()
+
+
+# Todo: change this when paddle supports this api
+def clip_grad_norm_(
+ parameters,
+ max_norm,
+ norm_type=2.0,
+ error_if_nonfinite=False, ):
+ r"""Clips gradient norm of the iteratable parameters.
+
+ The norm is computed over all gradients together, as if they were
+ concatenated into a single vector. Gradients are modified in place.
+
+ This API can only run in dynamic graph mode, not static graph mode.
+
+ Args:
+ parameters (Iterable[paddle.Tensor] or paddle.Tensor): an iterable of Tensors
+ or a single Tensor whose gradients will be normalized
+ max_norm (float or int): max norm of the gradients
+ norm_type (float or int): type of the p-norm to use. Can be `inf` for
+ the infinity norm.
+ error_if_nonfinite (bool): if True, throw an error if the total
+ norm of the gradients from :attr:`parameters` is `nan`,
+ `inf`, or `-inf`.
+
+ Returns:
+ Total norm of the parameter gradients (treated as a single vector).
+ Example:
+ .. code-block:: python
+ import paddle
+
+ x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
+ max_norm = float(5.0)
+ linear = paddle.nn.Linear(in_features=10, out_features=10)
+ out = linear(x)
+ loss = paddle.mean(out)
+ loss.backward()
+
+ paddle.nn.utils.clip_grad_norm_(linear.parameters(), max_norm)
+
+ sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters())
+ sgd.step()
+ """
+ if not paddle.in_dynamic_mode():
+ raise RuntimeError('this API can only run in dynamic mode.')
+
+ if isinstance(parameters, paddle.Tensor):
+ parameters = [parameters]
+
+ support_norm_type = [float("inf"), 0, 1, 2]
+ if norm_type not in support_norm_type:
+ raise ValueError(f'norm_type only support {support_norm_type}')
+
+ grads = [p.grad for p in parameters if p.grad is not None]
+ max_norm = float(max_norm)
+ norm_type = float(norm_type)
+ if len(grads) == 0:
+ return paddle.to_tensor(0.0)
+ if norm_type == float("inf"):
+ norms = [g.detach().abs().max() for g in grads]
+ total_norm = (norms[0]
+ if len(norms) == 1 else paddle.max(paddle.stack(norms)))
+ else:
+ total_norm = paddle.linalg.norm(
+ paddle.stack(
+ [paddle.linalg.norm(g.detach(), norm_type) for g in grads]),
+ norm_type, )
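+ # the p-norm of all gradients concatenated equals the p-norm of the
+ # vector of per-tensor gradient norms, which is what is computed above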
+
+ if error_if_nonfinite and paddle.logical_or(total_norm.isnan(),
+ total_norm.isinf()):
+ raise RuntimeError(
+ f'The total norm of order {norm_type} for the gradients from '
+ '`parameters` is non-finite, so it cannot be clipped. To disable '
+ 'this error and scale the gradients by the non-finite norm anyway, '
+ 'set `error_if_nonfinite=False`.')
+ clip_coef = max_norm / (total_norm + 1e-6)
+ # Note: multiplying by the clamped coefficient is redundant when it equals 1,
+ # but doing so avoids an `if clip_coef < 1:` branch.
+ clip_coef_clamped = paddle.clip(clip_coef, max=1.0)
+ with paddle.no_grad():
+ for _, p in enumerate(parameters):
+ g = p.grad
+ if g is not None:
+ p.grad = paddle.multiply(x=g, y=clip_coef_clamped)
+ return total_norm
+
+
+class HubertASRTrainer(Trainer):
+ def __init__(self, config, args):
+ super().__init__(config, args)
+ self.avg_train_loss = 0.0
+ self.loss_isfinite = True # when this flag is False, the loss is NaN or inf and cannot be averaged
+ self.use_sb = True # whether to use the SpeechBrain dataloader
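+ # note: use_sb is overridden in setup_dataloader() from the
+ # `use_sb_pipeline` config option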
+
+ def update_average(self, batch_index, loss):
+ """Update running average of the loss.
+ Arguments
+ ---------
+ batch_index : int
+ current batch index
+ loss : paddle.tensor
+ detached loss, a single float value.
+ """
+ if math.isfinite(loss):
+ self.avg_train_loss -= self.avg_train_loss / (batch_index + 1)
+ self.avg_train_loss += loss / (batch_index + 1)
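+ # incremental mean: avg_n = avg_{n-1} + (loss - avg_{n-1}) / n, written
+ # as two in-place updates so no loss history has to be stored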
+ else:
+ self.loss_isfinite = False
+ logger.info('loss: {} is NaN or inf, error'.format(loss))
+
+ def before_train(self):
+ from_scratch = self.resume_or_scratch()
+ if from_scratch:
+ # scratch: save init model, i.e. 0 epoch
+ self.save(tag='init', infos=None)
+ else:
+ # resume: train next_epoch and next_iteration
+ self.epoch += 1
+ logger.info(
+ f"Resume train: epoch {self.epoch }, step {self.iteration}!")
+
+ self.maybe_batch_sampler_step()
+
+ def train_batch(self, batch_index, batch, msg):
+ train_conf = self.config
+ start = time.time()
+
+ # forward
+ ## sb data pipeline
+ if self.use_sb:
+ wav, wavs_lens_rate = batch['sig']
+ target, target_lens_rate = batch['tokens']
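+ # the sb pipeline yields relative lengths in [0, 1]; absolute token
+ # lengths are recovered by scaling with the padded sequence length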
+ target_lens = (target_lens_rate *
+ target.shape[1]).round().astype(paddle.int64)
+ else:
+ utt, wav, wavs_lens, target, target_lens = batch
+ wavs_lens_rate = wavs_lens / wav.shape[1]
+ wav = wav[:, :, 0]
+ logger.info('training utt ids: {}'.format(utt))
+ if hasattr(train_conf, 'audio_augment'):
+ wav = self.speech_augmentation(wav, wavs_lens_rate)
+
+ loss = self.model(wav, wavs_lens_rate, target, target_lens)
+
+ # loss div by `batch_size * accum_grad`
+ loss /= train_conf.accum_grad
+ # update self.avg_train_loss
+ self.update_average(batch_index, float(loss))
+
+ # loss backward
+ if (batch_index + 1) % train_conf.accum_grad != 0:
+ # Disable gradient synchronizations across DDP processes.
+ # Within this context, gradients will be accumulated on module
+ # variables, which will later be synchronized.
+ # When using cpu w/o DDP, model does not have `no_sync`
+ context = self.model.no_sync if (hasattr(self.model, "no_sync") and
+ self.parallel) else nullcontext
+ else:
+ # Used for single gpu training and DDP gradient synchronization
+ # processes.
+ context = nullcontext
+ with context():
+ loss.backward()
+
+ layer_tools.print_grads(self.model, print_func=None)
+
+ # optimizer step
+ if (batch_index + 1) % train_conf.accum_grad == 0:
+ # do global grad clip
+ if train_conf.global_grad_clip != 0:
+ clip_grad_norm_(self.model.parameters(),
+ train_conf.global_grad_clip)
+ self.model_optimizer.step()
+ self.model_optimizer.clear_grad()
+ if not train_conf.freeze_hubert:
+ self.hubert_optimizer.step()
+ self.hubert_optimizer.clear_grad()
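+ # newbobscheduler is stepped once per epoch on the validation loss in
+ # do_train(); every other scheduler steps after each optimizer update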
+ if self.config.model_scheduler != 'newbobscheduler':
+ self.model_lr_scheduler.step()
+ if self.config.hubert_scheduler != 'newbobscheduler':
+ if not train_conf.freeze_hubert:
+ self.hubert_lr_scheduler.step()
+ self.iteration += 1
+
+ losses_np = {'loss': self.avg_train_loss * train_conf.accum_grad}
+ iteration_time = time.time() - start
+ for k, v in losses_np.items():
+ report(k, v)
+ report("loss_whitoutavg", float(loss))
+ report("batch_size", self.config.batch_size)
+ report("accum", train_conf.accum_grad)
+ report("step_cost", iteration_time)
+
+ if (batch_index + 1) % train_conf.accum_grad == 0:
+ if dist.get_rank() == 0 and self.visualizer:
+ losses_np_v = losses_np.copy()
+ losses_np_v.update({
+ "model_lr": self.model_lr_scheduler(),
+ "hubert_lr": self.hubert_lr_scheduler()
+ })
+ for key, val in losses_np_v.items():
+ self.visualizer.add_scalar(
+ tag='train/' + key, value=val, step=self.iteration - 1)
+
+ @paddle.no_grad()
+ def valid(self):
+ self.model.eval()
+ if not self.use_streamdata:
+ logger.info(
+ f"Valid Total Examples: {len(self.valid_loader.dataset)}")
+ valid_losses = {}
+ step = 0
+ total_loss = 0.0
+ num_seen_utts = 1 # a running average is used, so num_seen_utts is not needed here
+ for i, batch in enumerate(self.valid_loader):
+ if self.use_sb:
+ wav, wavs_lens_rate = batch['sig']
+ target, target_lens_rate = batch['tokens']
+ target_lens = (target_lens_rate *
+ target.shape[1]).round().astype(paddle.int64)
+ else:
+ utt, wav, wavs_lens, target, target_lens = batch
+ wavs_lens_rate = wavs_lens / wav.shape[1]
+ wav = wav[:, :, 0]
+
+ loss = self.model(wav, wavs_lens_rate, target, target_lens)
+ # use update_average
+ total_loss -= total_loss / (step + 1)
+ total_loss += loss / (step + 1)
+
+ if math.isfinite(float(loss)):
+ step += 1
+ valid_losses['val_loss'] = float(loss)
+ else:
+ logger.info('loss: {} is NaN or inf, error'.format(float(loss)))
+
+ if (i + 1) % self.config.log_interval == 0:
+ valid_losses['val_history_loss'] = float(total_loss)
+
+ # logging
+ msg = f"Valid: Rank: {dist.get_rank()}, "
+ msg += "epoch: {}, ".format(self.epoch)
+ msg += "step: {}, ".format(self.iteration)
+ if not self.use_streamdata:
+ msg += "batch: {}/{}, ".format(i + 1,
+ len(self.valid_loader))
+ msg += ', '.join('{}: {:>.6f}'.format(k, v)
+ for k, v in valid_losses.items())
+ logger.info(msg)
+
+ logger.info(
+ 'Rank {} Val info val_loss {}'.format(dist.get_rank(), total_loss))
+ return total_loss, num_seen_utts
+
+ @mp_tools.rank_zero_only
+ def save(self, tag=None, infos: dict=None):
+ """Save checkpoint (model parameters and optimizer states).
+
+ Args:
+ tag (int or str, optional): None for step, else use the given tag, e.g. epoch. Defaults to None.
+ infos (dict, optional): meta data to save. Defaults to None.
+ """
+
+ infos = infos if infos else dict()
+ infos.update({
+ "epoch": self.epoch,
+ "model_lr": self.model_optimizer.get_lr(),
+ "hubert_lr": self.hubert_optimizer.get_lr()
+ })
+
+ checkpoint_path = os.path.join(
+ self.checkpoint_dir,
+ "{}".format(self.iteration if tag is None else tag))
+
+ model_dict = self.model.state_dict()
+ params_path = checkpoint_path + ".pdparams"
+ paddle.save(model_dict, params_path)
+ logger.info("Saved model to {}".format(params_path))
+
+ model_opt_dict = self.model_optimizer.state_dict()
+ hubert_opt_dict = self.hubert_optimizer.state_dict()
+
+ opt_dict = {'model': model_opt_dict, 'hubert': hubert_opt_dict}
+
+ optimizer_path = checkpoint_path + ".pdopt"
+ paddle.save(opt_dict, optimizer_path)
+ logger.info("Saved optimzier state to {}".format(optimizer_path))
+
+ scheduler_dict = {}
+
+ if self.config.model_scheduler == 'newbobscheduler':
+ scheduler_dict['model'] = self.model_lr_scheduler.save()
+ if self.config.hubert_scheduler == 'newbobscheduler':
+ scheduler_dict['hubert'] = self.hubert_lr_scheduler.save()
+ if scheduler_dict:
+ scheduler_path = checkpoint_path + ".pdlrs"
+ paddle.save(scheduler_dict, scheduler_path)
+ logger.info("Saved scheduler state to {}".format(scheduler_path))
+ info_path = re.sub('.pdparams$', '.json', params_path)
+ infos = {} if infos is None else infos
+ with open(info_path, 'w', encoding='utf8') as fout:
+ data = json.dumps(infos)
+ fout.write(data)
+
+ def resume_or_scratch(self):
+ """Resume from latest checkpoint at checkpoints in the output
+ directory or load a specified checkpoint.
+
+ If ``args.checkpoint_path`` is not None, load the checkpoint, else
+ resume training.
+ """
+ scratch = None
+ if self.args.resume:
+ # just restore ckpt
+ # lr will be restored from the optimizer ckpt
+ resume_json_path = os.path.join(self.checkpoint_dir,
+ self.args.resume + '.json')
+ with open(resume_json_path, 'r', encoding='utf8') as f:
+ resume_json = json.load(f)
+ self.iteration = 0
+ self.epoch = resume_json["epoch"]
+
+ # restore model from *.pdparams
+ params_path = os.path.join(self.checkpoint_dir,
+ "{}".format(self.epoch)) + '.pdparams'
+ model_dict = paddle.load(params_path)
+ self.model.set_state_dict(model_dict)
+
+ # restore optimizer from *.pdopt
+ optimizer_path = os.path.join(self.checkpoint_dir,
+ "{}".format(self.epoch)) + '.pdopt'
+ optimizer_dict = paddle.load(optimizer_path)
+ self.model_optimizer.set_state_dict(optimizer_dict['model'])
+ self.hubert_optimizer.set_state_dict(optimizer_dict['hubert'])
+
+ # restore lr_scheduler from *.pdlrs
+ scheduler_path = os.path.join(self.checkpoint_dir,
+ "{}".format(self.epoch)) + '.pdlrs'
+ if os.path.isfile(scheduler_path):
+ scheduler_dict = paddle.load(scheduler_path)
+ if self.config.model_scheduler == 'newbobscheduler':
+ self.model_lr_scheduler.load(scheduler_dict['model'])
+ if self.config.hubert_scheduler == 'newbobscheduler':
+ self.hubert_lr_scheduler.load(scheduler_dict['hubert'])
+ logger.info(
+ f"Restore ckpt: epoch {self.epoch }, step {self.iteration}!")
+ scratch = False
+ else:
+ self.iteration = 0
+ self.epoch = 0
+ scratch = True
+ logger.info("Init from scratch!")
+ return scratch
+
+ def do_train(self):
+ """The training process control by step."""
+ # !!!IMPORTANT!!!
+ # Try to export the model by script; if it fails, refine the code
+ # to satisfy the script-export requirements
+ # script_model = paddle.jit.to_static(self.model)
+ # script_model_path = str(self.checkpoint_dir / 'init')
+ # paddle.jit.save(script_model, script_model_path)
+
+ self.before_train()
+ if not self.use_streamdata:
+ logger.info(
+ f"Train Total Examples: {len(self.train_loader.dataset)}")
+ while self.epoch < self.config.n_epoch:
+ with Timer("Epoch-Train Time Cost: {}"):
+ self.model.train()
+ try:
+ data_start_time = time.time()
+ for batch_index, batch in enumerate(self.train_loader):
+ dataload_time = time.time() - data_start_time
+ msg = "Train:"
+ observation = OrderedDict()
+ with ObsScope(observation):
+ report("Rank", dist.get_rank())
+ report("epoch", self.epoch)
+ report('step', self.iteration)
+ report("model_lr", self.model_optimizer.get_lr())
+ report("hubert_lr", self.hubert_optimizer.get_lr())
+ self.train_batch(batch_index, batch, msg)
+ self.after_train_batch()
+ report('iter', batch_index + 1)
+ if not self.use_streamdata:
+ report('total', len(self.train_loader))
+ report('reader_cost', dataload_time)
+ observation['batch_cost'] = observation[
+ 'reader_cost'] + observation['step_cost']
+ observation['samples'] = observation['batch_size']
+ observation['ips,samples/s'] = observation[
+ 'batch_size'] / observation['batch_cost']
+ for k, v in observation.items():
+ msg += f" {k.split(',')[0]}: "
+ msg += f"{v:>.8f}" if isinstance(v,
+ float) else f"{v}"
+ msg += f" {k.split(',')[1]}" if len(
+ k.split(',')) == 2 else ""
+ msg += ","
+ msg = msg[:-1] # remove the last ","
+ if (batch_index + 1) % self.config.log_interval == 0:
+ logger.info(msg)
+ data_start_time = time.time()
+ except Exception as e:
+ logger.error(e)
+ raise e
+ with Timer("Eval Time Cost: {}"):
+ total_loss, num_seen_utts = self.valid()
+ if dist.get_world_size() > 1:
+ num_seen_utts = paddle.to_tensor(num_seen_utts)
+ dist.all_reduce(num_seen_utts)
+ total_loss = paddle.to_tensor(total_loss)
+ dist.all_reduce(total_loss)
+ cv_loss = total_loss / num_seen_utts
+ cv_loss = float(cv_loss)
+ else:
+ cv_loss = float(total_loss)
+ logger.info(
+ 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
+ if self.visualizer:
+ self.visualizer.add_scalar(
+ tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+ self.visualizer.add_scalar(
+ tag='eval/model_lr',
+ value=self.model_lr_scheduler(),
+ step=self.epoch)
+ self.visualizer.add_scalar(
+ tag='eval/hubert_lr',
+ value=self.hubert_lr_scheduler(),
+ step=self.epoch)
+
+ if self.config.model_scheduler == 'newbobscheduler':
+ self.model_lr_scheduler.step(cv_loss)
+ if self.config.hubert_scheduler == 'newbobscheduler':
+ if not self.config.freeze_hubert:
+ self.hubert_lr_scheduler.step(cv_loss)
+ self.save(tag=self.epoch, infos={'val_loss': cv_loss})
+ self.avg_train_loss = 0.0
+ self.new_epoch()
+
+ def dataio_prepare(self, hparams):
+ """This function prepares the datasets to be used in the brain class.
+ It also defines the data processing pipeline through user-defined functions."""
+ data_folder = hparams["data_folder"]
+
+ train_data = dataset.DynamicItemDataset.from_csv(
+ csv_path=hparams["train_data"],
+ replacements={"data_root": data_folder}, )
+
+ if hparams["sorting"] == "ascending":
+ # we sort training data to speed up training and get better results.
+ train_data = train_data.filtered_sorted(sort_key="duration")
+ # when sorting, do not shuffle in the dataloader; otherwise sorting is pointless
+ hparams["train_dataloader_opts"]["shuffle"] = False
+
+ elif hparams["sorting"] == "descending":
+ train_data = train_data.filtered_sorted(
+ sort_key="duration", reverse=True)
+ # when sorting, do not shuffle in the dataloader; otherwise sorting is pointless
+ hparams["train_dataloader_opts"]["shuffle"] = False
+
+ elif hparams["sorting"] == "random":
+ pass
+
+ else:
+ raise NotImplementedError(
+ "sorting must be random, ascending or descending")
+
+ valid_data = dataset.DynamicItemDataset.from_csv(
+ csv_path=hparams["valid_data"],
+ replacements={"data_root": data_folder}, )
+ valid_data = valid_data.filtered_sorted(sort_key="duration")
+
+ test_data = dataset.DynamicItemDataset.from_csv(
+ csv_path=hparams["test_data"],
+ replacements={"data_root": data_folder}, )
+ test_data = test_data.filtered_sorted(sort_key="duration")
+
+ datasets = [train_data, valid_data, test_data]
+
+ # Defining tokenizer and loading it
+ tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
+ self.tokenizer = tokenizer
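+ # the BERT tokenizer yields (roughly) character-level units for Chinese;
+ # its vocab_size also defines the model output dimension in setup_model()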
+
+ # 2. Define audio pipeline:
+ @data_pipeline.takes("wav")
+ @data_pipeline.provides("sig")
+ def audio_pipeline(wav):
+ sig = dataio.read_audio(wav)
+ return sig
+
+ dataset.add_dynamic_item(datasets, audio_pipeline)
+
+ # 3. Define text pipeline:
+ @data_pipeline.takes("transcript")
+ @data_pipeline.provides("wrd", "tokens_list", "tokens")
+ def text_pipeline(wrd):
+ wrd = "".join(wrd.split(" "))
+ yield wrd
+ tokens_list = tokenizer(wrd)["input_ids"]
+ yield tokens_list
+ tokens = np.array(tokens_list, dtype="int64")
+ # tokens = paddle.to_tensor(tokens_list, dtype="int64")
+ yield tokens
+
+ dataset.add_dynamic_item(datasets, text_pipeline)
+
+ # 4. Set output:
+ dataset.set_output_keys(
+ datasets,
+ ["id", "sig", "wrd", "tokens"], )
+
+ # 5. If Dynamic Batching is used, we instantiate the needed samplers.
+ train_batch_sampler = None
+ valid_batch_sampler = None
+ if hparams["dynamic_batching"]:
+ from sampler import DynamicBatchSampler # noqa
+
+ dynamic_hparams = hparams["dynamic_batch_sampler"]
+ num_buckets = dynamic_hparams["num_buckets"]
+
+ train_batch_sampler = DynamicBatchSampler(
+ train_data,
+ dynamic_hparams["max_batch_len"],
+ num_buckets=num_buckets,
+ length_func=lambda x: x["duration"],
+ shuffle=dynamic_hparams["shuffle_ex"],
+ batch_ordering=dynamic_hparams["batch_ordering"], )
+
+ valid_batch_sampler = DynamicBatchSampler(
+ valid_data,
+ dynamic_hparams["max_batch_len"],
+ num_buckets=num_buckets,
+ length_func=lambda x: x["duration"],
+ shuffle=dynamic_hparams["shuffle_ex"],
+ batch_ordering=dynamic_hparams["batch_ordering"], )
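+ # both samplers bucket utterances by duration so that the total duration
+ # of each batch stays roughly within max_batch_len, reducing padding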
+
+ return (train_data, valid_data, test_data, tokenizer,
+ train_batch_sampler, valid_batch_sampler, )
+
+ def setup_dataloader(self):
+ config = self.config.clone()
+ self.use_streamdata = config.get("use_stream_data", False)
+ self.use_sb = config.get("use_sb_pipeline", False)
+ if self.use_sb:
+ hparams_file = config.sb_pipeline_conf
+ with open(hparams_file, 'r', encoding='utf8') as fin:
+ hparams = load_hyperpyyaml(fin, None)
+
+ (train_data, valid_data, test_data, tokenizer, train_bsampler,
+ valid_bsampler, ) = self.dataio_prepare(hparams)
+
+ train_dataloader_opts = hparams["train_dataloader_opts"]
+ valid_dataloader_opts = hparams["valid_dataloader_opts"]
+
+ if train_bsampler is not None:
+ train_dataloader_opts = {
+ "batch_sampler": train_bsampler,
+ "num_workers": hparams["num_workers"],
+ }
+
+ if valid_bsampler is not None:
+ valid_dataloader_opts = {"batch_sampler": valid_bsampler}
+
+ if self.train:
+ self.train_loader = make_dataloader(
+ train_data, stage='train', **train_dataloader_opts)
+ self.valid_loader = make_dataloader(
+ valid_data,
+ stage='val',
+ **valid_dataloader_opts, )
+ logger.info("Setup train/valid Dataloader!")
+ else:
+ self.test_loader = make_dataloader(
+ test_data, stage='test', **hparams["test_dataloader_opts"])
+ else:
+ if self.train:
+ self.train_loader = DataLoaderFactory.get_dataloader(
+ 'train', config, self.args)
+ self.valid_loader = DataLoaderFactory.get_dataloader(
+ 'valid', config, self.args)
+ logger.info("Setup train/valid Dataloader!")
+ else:
+ decode_batch_size = config.get('decode', dict()).get(
+ 'decode_batch_size', 1)
+ self.test_loader = DataLoaderFactory.get_dataloader(
+ 'test', config, self.args)
+ self.align_loader = DataLoaderFactory.get_dataloader(
+ 'align', config, self.args)
+ logger.info("Setup test/align Dataloader!")
+
+ def setup_model(self):
+ config = self.config
+ model_conf = config
+
+ with UpdateConfig(model_conf):
+ if self.use_sb:
+ model_conf.output_dim = self.tokenizer.vocab_size
+ else:
+ if self.train:
+ model_conf.input_dim = self.train_loader.feat_dim
+ model_conf.output_dim = self.train_loader.vocab_size
+ else:
+ model_conf.input_dim = self.test_loader.feat_dim
+ model_conf.output_dim = self.test_loader.vocab_size
+
+ model = HubertASR.from_config(model_conf)
+
+ model_dict = paddle.load(config.hubert_params_path)
+ model.set_state_dict(model_dict)
+
+ if self.parallel:
+ model = paddle.DataParallel(model, find_unused_parameters=True)
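+ # find_unused_parameters=True: some backbone parameters may not receive
+ # gradients every step (e.g. when freeze_hubert is set)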
+
+ layer_tools.print_params(model, logger.info)
+ self.model = model
+ logger.info("Setup model!")
+
+ # setup speech augmentation for hubert
+ if hasattr(config, 'audio_augment') and self.train:
+ self.speech_augmentation = TimeDomainSpecAugment(
+ **config.audio_augment)
+
+ if not self.train:
+ return
+
+ train_config = config
+ model_optim_type = train_config.model_optim
+ model_optim_conf = train_config.model_optim_conf
+ logger.info("optim_model:{},{}", model_optim_type, model_optim_conf)
+ hubert_optim_type = train_config.hubert_optim
+ hubert_optim_conf = train_config.hubert_optim_conf
+ logger.info("optim_model:{},{}", hubert_optim_type, hubert_optim_conf)
+
+ model_scheduler_type = train_config.model_scheduler
+ model_scheduler_conf = train_config.model_scheduler_conf
+ hubert_scheduler_type = train_config.hubert_scheduler
+ hubert_scheduler_conf = train_config.hubert_scheduler_conf
+
+ model_scheduler_args = dict(
+ **{"learning_rate": model_optim_conf.lr,
+ "verbose": False}, **(dict(model_scheduler_conf)))
+
+ hubert_scheduler_args = dict(
+ **{"learning_rate": hubert_optim_conf.lr,
+ "verbose": False}, **(dict(hubert_scheduler_conf)))
+
+ model_lr_scheduler = LRSchedulerFactory.from_args(model_scheduler_type,
+ model_scheduler_args)
+ hubert_lr_scheduler = LRSchedulerFactory.from_args(
+ hubert_scheduler_type, hubert_scheduler_args)
+
+ def optimizer_args(
+ config,
+ optim_type,
+ optim_conf,
+ parameters,
+ lr_scheduler=None, ):
+ optim_arg = dict(optim_conf)
+ optim_arg.update({
+ "learning_rate":
+ lr_scheduler if lr_scheduler else optim_conf.lr,
+ "parameters":
+ parameters
+ })
+ return optim_arg
+
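+ # the downstream enc/ctc parameters and the hubert backbone use separate
+ # optimizers and schedules; under DataParallel the parameters are reached
+ # through model._layers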
+ model_optimizer_args = optimizer_args(config, model_optim_type,
+ model_optim_conf, [{
+ 'params':
+ model._layers.enc.parameters()
+ }, {
+ 'params':
+ model._layers.ctc.parameters()
+ }] if self.parallel else [{
+ 'params':
+ model.enc.parameters()
+ }, {
+ 'params':
+ model.ctc.parameters()
+ }], model_lr_scheduler)
+
+ hubert_optimizer_args = optimizer_args(
+ config, hubert_optim_type, hubert_optim_conf,
+ model._layers.hubert.parameters() if self.parallel else
+ model.hubert.parameters(), hubert_lr_scheduler)
+
+ model_optimizer = OptimizerFactory.from_args(model_optim_type,
+ model_optimizer_args)
+ hubert_optimizer = OptimizerFactory.from_args(hubert_optim_type,
+ hubert_optimizer_args)
+
+ self.model_optimizer = model_optimizer
+ self.hubert_optimizer = hubert_optimizer
+ self.model_lr_scheduler = model_lr_scheduler
+ self.hubert_lr_scheduler = hubert_lr_scheduler
+ logger.info("Setup optimizer/lr_scheduler!")
+
+
+class HubertASRTester(HubertASRTrainer):
+ def __init__(self, config, args):
+ super().__init__(config, args)
+ self.text_featurizer = TextFeaturizer(
+ unit_type=config.unit_type, vocab=config.vocab_filepath)
+ self.vocab_list = self.text_featurizer.vocab_list
+
+ def id2token(self, texts, texts_len):
+ """ ord() id to chr() chr """
+ trans = []
+ for text, n in zip(texts, texts_len):
+ n = n.numpy().item()
+ ids = text[:n]
+ trans.append(self.text_featurizer.defeaturize(ids.numpy().tolist()))
+ return trans
+
+ def compute_metrics(self, id, audio, audio_len, texts, texts_len,
+ fout=None):
+ decode_cfg = self.config.decode
+ errors_sum, len_refs, num_ins = 0.0, 0, 0
+ errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors
+ error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer
+
+ start_time = time.time()
+ target_transcripts = self.id2token(texts, texts_len)
+ result_transcripts, result_tokenids = self.model.decode(
+ audio,
+ text_feature=self.text_featurizer,
+ decoding_method=decode_cfg.decoding_method,
+ beam_size=decode_cfg.beam_size)
+ decode_time = time.time() - start_time
+
+ for utt, target, result, rec_tids in zip(
+ id, target_transcripts, result_transcripts, result_tokenids):
+ errors, len_ref = errors_func(target, result)
+ errors_sum += errors
+ len_refs += len_ref
+ num_ins += 1
+ if fout:
+ fout.write({
+ "utt": utt,
+ "refs": [target],
+ "hyps": [result],
+ "hyps_tokenid": [rec_tids],
+ })
+ logger.info(f"Utt: {utt}")
+ logger.info(f"Ref: {target}")
+ logger.info(f"Hyp: {result}")
+ logger.info("One example error rate [%s] = %f" % (
+ decode_cfg.error_rate_type, error_rate_func(target, result)))
+
+ return dict(
+ errors_sum=errors_sum,
+ len_refs=len_refs,
+ num_ins=num_ins, # num examples
+ error_rate=errors_sum / len_refs,
+ error_rate_type=decode_cfg.error_rate_type,
+ num_frames=audio_len.sum().numpy().item(),
+ decode_time=decode_time)
+
+ def sb_compute_metrics(self, id, sig, wrd, tokens, fout=None):
+ decode_cfg = self.config.decode
+ errors_sum, len_refs, num_ins = 0.0, 0, 0
+ errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors
+ error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer
+ start_time = time.time()
+ target_transcripts = wrd
+ result_transcripts, result_tokenids = self.model.decode(
+ sig[0],
+ text_feature=self.tokenizer,
+ decoding_method=decode_cfg.decoding_method,
+ beam_size=decode_cfg.beam_size,
+ sb_pipeline=True)
+ decode_time = time.time() - start_time
+
+ for utt, target, result, rec_tids in zip(
+ id, target_transcripts, result_transcripts, result_tokenids):
+ errors, len_ref = errors_func(target, result)
+ errors_sum += errors
+ len_refs += len_ref
+ num_ins += 1
+ if fout:
+ fout.write({
+ "utt": utt,
+ "refs": [target],
+ "hyps": [result],
+ "hyps_tokenid": [rec_tids],
+ })
+ logger.info(f"Utt: {utt}")
+ logger.info(f"Ref: {target}")
+ logger.info(f"Hyp: {result}")
+ logger.info("One example error rate [%s] = %f" % (
+ decode_cfg.error_rate_type, error_rate_func(target, result)))
+
+ return dict(
+ errors_sum=errors_sum,
+ len_refs=len_refs,
+ num_ins=num_ins, # num examples
+ error_rate=errors_sum / len_refs,
+ error_rate_type=decode_cfg.error_rate_type,
+ num_frames=sig[1].sum().numpy().item(),
+ decode_time=decode_time)
+
+ @mp_tools.rank_zero_only
+ @paddle.no_grad()
+ def test(self):
+ logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
+ self.model.eval()
+
+ error_rate_type = None
+ errors_sum, len_refs, num_ins = 0.0, 0, 0
+ num_frames = 0.0
+ num_time = 0.0
+ # Initialized the decoder in model
+ decode_cfg = self.config.decode
+ vocab_list = self.vocab_list
+ decode_batch_size = decode_cfg.decode_batch_size
+
+ with jsonlines.open(self.args.result_file, 'w') as fout:
+ for i, batch in enumerate(self.test_loader):
+ if self.use_sb:
+ metrics = self.sb_compute_metrics(**batch, fout=fout)
+ else:
+ metrics = self.compute_metrics(*batch, fout=fout)
+ num_frames += metrics['num_frames']
+ num_time += metrics["decode_time"]
+ errors_sum += metrics['errors_sum']
+ len_refs += metrics['len_refs']
+ num_ins += metrics['num_ins']
+ error_rate_type = metrics['error_rate_type']
+ rtf = num_time / (num_frames)
+ logger.info(
+ "RTF: %f, Error rate [%s] (%d/?) = %f" %
+ (rtf, error_rate_type, num_ins, errors_sum / len_refs))
+
+ # logging
+ msg = "Test: "
+ msg += "epoch: {}, ".format(self.epoch)
+ msg += "step: {}, ".format(self.iteration)
+ msg += "Final error rate [%s] (%d/%d) = %f" % (
+ error_rate_type, num_ins, num_ins, errors_sum / len_refs)
+ logger.info(msg)
+
+ err_meta_path = os.path.splitext(self.args.result_file)[0] + '.err'
+ err_type_str = "{}".format(error_rate_type)
+ with open(err_meta_path, 'w', encoding='utf8') as f:
+ data = json.dumps({
+ "epoch":
+ self.epoch,
+ "step":
+ self.iteration,
+ "rtf":
+ rtf,
+ error_rate_type:
+ errors_sum / len_refs,
+ "dataset_hour": (num_frames) / 1000.0 / 3600.0,
+ "process_hour":
+ num_time / 1000.0 / 3600.0,
+ "num_examples":
+ num_ins,
+ "err_sum":
+ errors_sum,
+ "ref_len":
+ len_refs,
+ "decode_method":
+ self.config.decode.decoding_method,
+ })
+ f.write(data + '\n')
diff --git a/paddlespeech/s2t/exps/u2/bin/alignment.py b/paddlespeech/s2t/exps/u2/bin/alignment.py
index e3390feb..64cafc48 100644
--- a/paddlespeech/s2t/exps/u2/bin/alignment.py
+++ b/paddlespeech/s2t/exps/u2/bin/alignment.py
@@ -12,11 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Alignment for U2 model."""
-from yacs.config import CfgNode
-
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
+from paddlespeech.s2t.training.cli import config_from_args
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.s2t.training.cli import maybe_dump_config
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
@@ -32,26 +32,10 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
- # save asr result to
- parser.add_argument(
- "--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())
- # https://yaml.org/type/float.html
- config = CfgNode(new_allowed=True)
- if args.config:
- config.merge_from_file(args.config)
- if args.decode_cfg:
- decode_confs = CfgNode(new_allowed=True)
- decode_confs.merge_from_file(args.decode_cfg)
- config.decode = decode_confs
- if args.opts:
- config.merge_from_list(args.opts)
- config.freeze()
+ config = config_from_args(args)
print(config)
- if args.dump_config:
- with open(args.dump_config, 'w') as f:
- print(config, file=f)
-
+ maybe_dump_config(args.dump_config, config)
main(config, args)
diff --git a/paddlespeech/s2t/exps/u2/bin/export.py b/paddlespeech/s2t/exps/u2/bin/export.py
index 592b1237..de4a55a4 100644
--- a/paddlespeech/s2t/exps/u2/bin/export.py
+++ b/paddlespeech/s2t/exps/u2/bin/export.py
@@ -12,11 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Export for U2 model."""
-from yacs.config import CfgNode
-
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
+from paddlespeech.s2t.training.cli import config_from_args
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.s2t.training.cli import maybe_dump_config
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
@@ -32,22 +32,10 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
- # save jit model to
- parser.add_argument(
- "--export_path", type=str, help="path of the jit model to save")
args = parser.parse_args()
print_arguments(args, globals())
- # https://yaml.org/type/float.html
- config = CfgNode(new_allowed=True)
- if args.config:
- config.merge_from_file(args.config)
- if args.opts:
- config.merge_from_list(args.opts)
- config.freeze()
+ config = config_from_args(args)
print(config)
- if args.dump_config:
- with open(args.dump_config, 'w') as f:
- print(config, file=f)
-
+ maybe_dump_config(args.dump_config, config)
main(config, args)
diff --git a/paddlespeech/s2t/exps/u2/bin/quant.py b/paddlespeech/s2t/exps/u2/bin/quant.py
index 6d361c5f..73a9794f 100755
--- a/paddlespeech/s2t/exps/u2/bin/quant.py
+++ b/paddlespeech/s2t/exps/u2/bin/quant.py
@@ -15,14 +15,15 @@
import paddle
from kaldiio import ReadHelper
from paddleslim import PTQ
-from yacs.config import CfgNode
from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.models.u2 import U2Model
+from paddlespeech.s2t.training.cli import config_from_args
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.log import Log
from paddlespeech.s2t.utils.utility import UpdateConfig
+
logger = Log(__name__).getlog()
@@ -173,32 +174,7 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
- # save asr result to
- parser.add_argument(
- "--result_file", type=str, help="path of save the asr result")
- parser.add_argument(
- "--audio_scp", type=str, help="path of the input audio file")
- parser.add_argument(
- "--num_utts",
- type=int,
- default=200,
- help="num utts for quant calibrition.")
- parser.add_argument(
- "--export_path",
- type=str,
- default='export.jit.quant',
- help="path of the input audio file")
args = parser.parse_args()
- config = CfgNode(new_allowed=True)
-
- if args.config:
- config.merge_from_file(args.config)
- if args.decode_cfg:
- decode_confs = CfgNode(new_allowed=True)
- decode_confs.merge_from_file(args.decode_cfg)
- config.decode = decode_confs
- if args.opts:
- config.merge_from_list(args.opts)
- config.freeze()
+ config = config_from_args(args)
main(config, args)
diff --git a/paddlespeech/s2t/exps/u2/bin/test.py b/paddlespeech/s2t/exps/u2/bin/test.py
index b13fd0d3..ea187862 100644
--- a/paddlespeech/s2t/exps/u2/bin/test.py
+++ b/paddlespeech/s2t/exps/u2/bin/test.py
@@ -14,11 +14,11 @@
"""Evaluation for U2 model."""
import cProfile
-from yacs.config import CfgNode
-
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
+from paddlespeech.s2t.training.cli import config_from_args
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.s2t.training.cli import maybe_dump_config
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
@@ -34,27 +34,12 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
- # save asr result to
- parser.add_argument(
- "--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())
- # https://yaml.org/type/float.html
- config = CfgNode(new_allowed=True)
- if args.config:
- config.merge_from_file(args.config)
- if args.decode_cfg:
- decode_confs = CfgNode(new_allowed=True)
- decode_confs.merge_from_file(args.decode_cfg)
- config.decode = decode_confs
- if args.opts:
- config.merge_from_list(args.opts)
- config.freeze()
+ config = config_from_args(args)
print(config)
- if args.dump_config:
- with open(args.dump_config, 'w') as f:
- print(config, file=f)
+ maybe_dump_config(args.dump_config, config)
# Setting for profiling
pr = cProfile.Profile()
diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py
index 0df44319..a6228a12 100644
--- a/paddlespeech/s2t/exps/u2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py
@@ -16,15 +16,14 @@ import os
import sys
from pathlib import Path
-import distutils
import numpy as np
import paddle
import soundfile
-from yacs.config import CfgNode
from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.models.u2 import U2Model
+from paddlespeech.s2t.training.cli import config_from_args
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.log import Log
from paddlespeech.s2t.utils.utility import UpdateConfig
@@ -125,27 +124,7 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
- # save asr result to
- parser.add_argument(
- "--result_file", type=str, help="path of save the asr result")
- parser.add_argument(
- "--audio_file", type=str, help="path of the input audio file")
- parser.add_argument(
- "--debug",
- type=distutils.util.strtobool,
- default=False,
- help="for debug.")
args = parser.parse_args()
- config = CfgNode(new_allowed=True)
-
- if args.config:
- config.merge_from_file(args.config)
- if args.decode_cfg:
- decode_confs = CfgNode(new_allowed=True)
- decode_confs.merge_from_file(args.decode_cfg)
- config.decode = decode_confs
- if args.opts:
- config.merge_from_list(args.opts)
- config.freeze()
+ config = config_from_args(args)
main(config, args)
diff --git a/paddlespeech/s2t/exps/u2/bin/train.py b/paddlespeech/s2t/exps/u2/bin/train.py
index dc3a87c1..b52d5e90 100644
--- a/paddlespeech/s2t/exps/u2/bin/train.py
+++ b/paddlespeech/s2t/exps/u2/bin/train.py
@@ -15,13 +15,11 @@
import cProfile
import os
-from yacs.config import CfgNode
-
from paddlespeech.s2t.exps.u2.model import U2Trainer as Trainer
+from paddlespeech.s2t.training.cli import config_from_args
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
-
-# from paddlespeech.s2t.exps.u2.trainer import U2Trainer as Trainer
+from paddlespeech.s2t.training.cli import maybe_dump_config
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
@@ -39,17 +37,9 @@ if __name__ == "__main__":
args = parser.parse_args()
print_arguments(args, globals())
- # https://yaml.org/type/float.html
- config = CfgNode(new_allowed=True)
- if args.config:
- config.merge_from_file(args.config)
- if args.opts:
- config.merge_from_list(args.opts)
- config.freeze()
+ config = config_from_args(args)
print(config)
- if args.dump_config:
- with open(args.dump_config, 'w') as f:
- print(config, file=f)
+ maybe_dump_config(args.dump_config, config)
# Setting for profiling
pr = cProfile.Profile()
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index 5b7654d4..11dd0b06 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -23,6 +23,7 @@ import jsonlines
import numpy as np
import paddle
from paddle import distributed as dist
+from paddle.nn.utils import clip_grad_norm_
from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
from paddlespeech.s2t.io.dataloader import DataLoaderFactory
@@ -47,14 +48,16 @@ class U2Trainer(Trainer):
def __init__(self, config, args):
super().__init__(config, args)
- def train_batch(self, batch_index, batch_data, msg):
+ def train_batch(self, batch_index, batch_data, scaler, msg):
train_conf = self.config
start = time.time()
# forward
utt, audio, audio_len, text, text_len = batch_data
- loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
- text_len)
+ with paddle.amp.auto_cast(
+ level=self.amp_level, enable=True if scaler else False):
+ loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
+ text_len)
# loss div by `batch_size * accum_grad`
loss /= train_conf.accum_grad
@@ -77,12 +80,26 @@ class U2Trainer(Trainer):
# processes.
context = nullcontext
with context():
- loss.backward()
+ if scaler:
+ scaler.scale(loss).backward()
+ else:
+ loss.backward()
layer_tools.print_grads(self.model, print_func=None)
# optimizer step
if (batch_index + 1) % train_conf.accum_grad == 0:
- self.optimizer.step()
+ # do global grad clip
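+ # under AMP the gradients must be unscaled before computing the
+ # global norm, so the clip threshold applies to true gradient values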
+ if train_conf.global_grad_clip != 0:
+ if scaler:
+ scaler.unscale_(self.optimizer)
+ # need paddlepaddle==develop or paddlepaddle>=2.5
+ clip_grad_norm_(self.model.parameters(),
+ train_conf.global_grad_clip)
+ if scaler:
+ scaler.step(self.optimizer)
+ scaler.update()
+ else:
+ self.optimizer.step()
self.optimizer.clear_grad()
self.lr_scheduler.step()
self.iteration += 1
@@ -173,7 +190,8 @@ class U2Trainer(Trainer):
report("epoch", self.epoch)
report('step', self.iteration)
report("lr", self.lr_scheduler())
- self.train_batch(batch_index, batch, msg)
+ self.train_batch(batch_index, batch, self.scaler,
+ msg)
self.after_train_batch()
report('iter', batch_index + 1)
if not self.use_streamdata:
@@ -253,6 +271,19 @@ class U2Trainer(Trainer):
model_conf.output_dim = self.test_loader.vocab_size
model = U2Model.from_config(model_conf)
+
+ # For Mixed Precision Training
+ self.use_amp = self.config.get("use_amp", True)
+ self.amp_level = self.config.get("amp_level", "O1")
+ if self.train and self.use_amp:
+ self.scaler = paddle.amp.GradScaler(
+ init_loss_scaling=self.config.get(
+ "scale_loss", 32768.0)) #amp default num 32768.0
+ #Set amp_level
+ if self.amp_level == 'O2':
+ model = paddle.amp.decorate(models=model, level=self.amp_level)
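+ # O2 casts the model parameters to float16 via decorate(); O1 only runs
+ # selected ops in float16, so the model itself is left unchanged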
+ else:
+ self.scaler = None
if self.parallel:
model = paddle.DataParallel(model)
@@ -290,7 +321,6 @@ class U2Trainer(Trainer):
scheduler_type = train_config.scheduler
scheduler_conf = train_config.scheduler_conf
return {
- "grad_clip": train_config.global_grad_clip,
"weight_decay": optim_conf.weight_decay,
"learning_rate": lr_scheduler
if lr_scheduler else optim_conf.lr,
diff --git a/paddlespeech/s2t/exps/u2_kaldi/bin/test.py b/paddlespeech/s2t/exps/u2_kaldi/bin/test.py
index 422483b9..4137537e 100644
--- a/paddlespeech/s2t/exps/u2_kaldi/bin/test.py
+++ b/paddlespeech/s2t/exps/u2_kaldi/bin/test.py
@@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
model_test_alias = {
"u2": "paddlespeech.s2t.exps.u2.model:U2Tester",
diff --git a/paddlespeech/s2t/exps/u2_kaldi/bin/train.py b/paddlespeech/s2t/exps/u2_kaldi/bin/train.py
index b11da715..011aabac 100644
--- a/paddlespeech/s2t/exps/u2_kaldi/bin/train.py
+++ b/paddlespeech/s2t/exps/u2_kaldi/bin/train.py
@@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
model_train_alias = {
"u2": "paddlespeech.s2t.exps.u2.model:U2Trainer",
diff --git a/paddlespeech/s2t/exps/u2_st/bin/export.py b/paddlespeech/s2t/exps/u2_st/bin/export.py
index c641152f..a2a7424c 100644
--- a/paddlespeech/s2t/exps/u2_st/bin/export.py
+++ b/paddlespeech/s2t/exps/u2_st/bin/export.py
@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
diff --git a/paddlespeech/s2t/exps/u2_st/bin/test.py b/paddlespeech/s2t/exps/u2_st/bin/test.py
index c07c95bd..30a903ce 100644
--- a/paddlespeech/s2t/exps/u2_st/bin/test.py
+++ b/paddlespeech/s2t/exps/u2_st/bin/test.py
@@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
diff --git a/paddlespeech/s2t/exps/u2_st/bin/train.py b/paddlespeech/s2t/exps/u2_st/bin/train.py
index 574942e5..b36a0af4 100644
--- a/paddlespeech/s2t/exps/u2_st/bin/train.py
+++ b/paddlespeech/s2t/exps/u2_st/bin/train.py
@@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTrainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/test.py b/paddlespeech/s2t/exps/wav2vec2/bin/test.py
index a376651d..c17cee0f 100644
--- a/paddlespeech/s2t/exps/wav2vec2/bin/test.py
+++ b/paddlespeech/s2t/exps/wav2vec2/bin/test.py
@@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
diff --git a/paddlespeech/s2t/exps/wav2vec2/bin/train.py b/paddlespeech/s2t/exps/wav2vec2/bin/train.py
index 29e7ef55..0c37f796 100644
--- a/paddlespeech/s2t/exps/wav2vec2/bin/train.py
+++ b/paddlespeech/s2t/exps/wav2vec2/bin/train.py
@@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTrainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
diff --git a/paddlespeech/s2t/exps/wavlm/__init__.py b/paddlespeech/s2t/exps/wavlm/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/paddlespeech/s2t/exps/wavlm/bin/__init__.py b/paddlespeech/s2t/exps/wavlm/bin/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/paddlespeech/s2t/exps/wavlm/bin/test.py b/paddlespeech/s2t/exps/wavlm/bin/test.py
new file mode 100644
index 00000000..f56b418b
--- /dev/null
+++ b/paddlespeech/s2t/exps/wavlm/bin/test.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Evaluation for WavLM model."""
+import cProfile
+
+from yacs.config import CfgNode
+
+from paddlespeech.s2t.exps.wavlm.model import WavLMASRTester as Tester
+from paddlespeech.s2t.training.cli import default_argument_parser
+from paddlespeech.utils.argparse import print_arguments
+
+
+def main_sp(config, args):
+ exp = Tester(config, args)
+ with exp.eval():
+ exp.setup()
+ exp.run_test()
+
+
+def main(config, args):
+ main_sp(config, args)
+
+
+if __name__ == "__main__":
+ parser = default_argument_parser()
+    # save asr result to this file
+ parser.add_argument(
+ '--dict-path', type=str, default=None, help='dict path.')
+ parser.add_argument(
+ "--result_file", type=str, help="path of save the asr result")
+ args = parser.parse_args()
+ print_arguments(args, globals())
+
+ # https://yaml.org/type/float.html
+ config = CfgNode(new_allowed=True)
+ if args.config:
+ config.merge_from_file(args.config)
+ if args.decode_cfg:
+ decode_confs = CfgNode(new_allowed=True)
+ decode_confs.merge_from_file(args.decode_cfg)
+ config.decode = decode_confs
+ if args.opts:
+ config.merge_from_list(args.opts)
+ config.freeze()
+ print(config)
+ if args.dump_config:
+ with open(args.dump_config, 'w') as f:
+ print(config, file=f)
+
+ # Setting for profiling
+ pr = cProfile.Profile()
+ pr.runcall(main, config, args)
+ pr.dump_stats('test.profile')
diff --git a/paddlespeech/s2t/exps/wavlm/bin/test_wav.py b/paddlespeech/s2t/exps/wavlm/bin/test_wav.py
new file mode 100644
index 00000000..e6c07629
--- /dev/null
+++ b/paddlespeech/s2t/exps/wavlm/bin/test_wav.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Evaluation for wavlm model."""
+import os
+import sys
+from pathlib import Path
+
+import paddle
+import soundfile
+from paddlenlp.transformers import AutoTokenizer
+from yacs.config import CfgNode
+
+from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
+from paddlespeech.s2t.models.wavlm.wavlm_asr import WavLMASR
+from paddlespeech.s2t.training.cli import default_argument_parser
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.s2t.utils.utility import UpdateConfig
+logger = Log(__name__).getlog()
+
+
+class WavLMInfer():
+ def __init__(self, config, args):
+ self.args = args
+ self.config = config
+ self.audio_file = args.audio_file
+ self.tokenizer = config.get("tokenizer", None)
+
+ if self.tokenizer:
+ self.text_feature = AutoTokenizer.from_pretrained(
+ self.config.tokenizer)
+ else:
+ self.text_feature = TextFeaturizer(
+ unit_type=config.unit_type, vocab=config.vocab_filepath)
+
+ paddle.set_device('gpu' if self.args.ngpu > 0 else 'cpu')
+
+ # model
+ model_conf = config
+ with UpdateConfig(model_conf):
+ model_conf.output_dim = self.text_feature.vocab_size
+ model = WavLMASR.from_config(model_conf)
+ self.model = model
+ self.model.eval()
+
+ # load model
+ params_path = self.args.checkpoint_path + ".pdparams"
+ model_dict = paddle.load(params_path)
+ self.model.set_state_dict(model_dict)
+
+ def run(self):
+        check(self.audio_file)
+
+ with paddle.no_grad():
+ # read
+ audio, _ = soundfile.read(
+ self.audio_file, dtype="int16", always_2d=True)
+ logger.info(f"audio shape: {audio.shape}")
+ xs = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)
+ decode_config = self.config.decode
+ result_transcripts, result_tokenids = self.model.decode(
+ xs,
+ text_feature=self.text_feature,
+ decoding_method=decode_config.decoding_method,
+ beam_size=decode_config.beam_size,
+ tokenizer=self.tokenizer, )
+ rsl = result_transcripts[0]
+ utt = Path(self.audio_file).name
+ logger.info(f"hyp: {utt} {rsl}")
+ return rsl
+
+
+def check(audio_file):
+ if not os.path.isfile(audio_file):
+ print("Please input the right audio file path")
+ sys.exit(-1)
+
+ logger.info("checking the audio file format......")
+ try:
+ sig, sample_rate = soundfile.read(audio_file)
+ except Exception as e:
+ logger.error(str(e))
+ logger.error(
+ "can not open the wav file, please check the audio file format")
+ sys.exit(-1)
+ logger.info("The sample rate is %d" % sample_rate)
+ assert (sample_rate == 16000)
+ logger.info("The audio file format is right")
+
+
+def main(config, args):
+ WavLMInfer(config, args).run()
+
+
+if __name__ == "__main__":
+ parser = default_argument_parser()
+    # save asr result to this file
+ parser.add_argument(
+ "--result_file", type=str, help="path of save the asr result")
+ parser.add_argument(
+ "--audio_file", type=str, help="path of the input audio file")
+ args = parser.parse_args()
+
+ config = CfgNode(new_allowed=True)
+
+ if args.config:
+ config.merge_from_file(args.config)
+ if args.decode_cfg:
+ decode_confs = CfgNode(new_allowed=True)
+ decode_confs.merge_from_file(args.decode_cfg)
+ config.decode = decode_confs
+ if args.opts:
+ config.merge_from_list(args.opts)
+ config.freeze()
+ main(config, args)
diff --git a/paddlespeech/s2t/exps/wavlm/bin/train.py b/paddlespeech/s2t/exps/wavlm/bin/train.py
new file mode 100644
index 00000000..4ad966b7
--- /dev/null
+++ b/paddlespeech/s2t/exps/wavlm/bin/train.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Trainer for wavlm model."""
+import cProfile
+import os
+
+from yacs.config import CfgNode
+
+from paddlespeech.s2t.exps.wavlm.model import WavLMASRTrainer as Trainer
+from paddlespeech.s2t.training.cli import default_argument_parser
+from paddlespeech.utils.argparse import print_arguments
+
+
+def main_sp(config, args):
+ exp = Trainer(config, args)
+ exp.setup()
+ exp.run()
+
+
+def main(config, args):
+ main_sp(config, args)
+
+
+if __name__ == "__main__":
+ parser = default_argument_parser()
+ parser.add_argument(
+ '--resume', type=str, default="", nargs="?", help='resume ckpt path.')
+ args = parser.parse_args()
+ print_arguments(args, globals())
+ # https://yaml.org/type/float.html
+ config = CfgNode(new_allowed=True)
+ if args.config:
+ config.merge_from_file(args.config)
+ if args.opts:
+ config.merge_from_list(args.opts)
+ config.freeze()
+ if args.dump_config:
+ with open(args.dump_config, 'w') as f:
+ print(config, file=f)
+
+ # Setting for profiling
+ pr = cProfile.Profile()
+ pr.runcall(main, config, args)
+ pr.dump_stats(os.path.join(args.output, 'train.profile'))
diff --git a/paddlespeech/s2t/exps/wavlm/model.py b/paddlespeech/s2t/exps/wavlm/model.py
new file mode 100644
index 00000000..6ed2c5d8
--- /dev/null
+++ b/paddlespeech/s2t/exps/wavlm/model.py
@@ -0,0 +1,912 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains wavlm model."""
+import json
+import math
+import os
+import re
+import time
+from collections import OrderedDict
+from contextlib import nullcontext
+
+import jsonlines
+import numpy as np
+import paddle
+from hyperpyyaml import load_hyperpyyaml
+from paddle import distributed as dist
+from paddlenlp.transformers import AutoTokenizer
+
+from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
+from paddlespeech.s2t.io.dataloader import DataLoaderFactory
+from paddlespeech.s2t.io.speechbrain import data_pipeline
+from paddlespeech.s2t.io.speechbrain import dataio
+from paddlespeech.s2t.io.speechbrain import dataset
+from paddlespeech.s2t.io.speechbrain.dataloader import make_dataloader
+from paddlespeech.s2t.models.wavlm.processing.speech_augmentation import TimeDomainSpecAugment
+from paddlespeech.s2t.models.wavlm.wavlm_asr import WavLMASR
+from paddlespeech.s2t.training.optimizer import OptimizerFactory
+from paddlespeech.s2t.training.reporter import ObsScope
+from paddlespeech.s2t.training.reporter import report
+from paddlespeech.s2t.training.scheduler import LRSchedulerFactory
+from paddlespeech.s2t.training.timer import Timer
+from paddlespeech.s2t.training.trainer import Trainer
+from paddlespeech.s2t.utils import error_rate
+from paddlespeech.s2t.utils import layer_tools
+from paddlespeech.s2t.utils import mp_tools
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.s2t.utils.utility import UpdateConfig
+
+logger = Log(__name__).getlog()
+
+
+def clip_grad_norm_(
+ parameters,
+ max_norm,
+ norm_type=2.0,
+ error_if_nonfinite=False, ):
+ r"""Clips gradient norm of the iteratable parameters.
+
+ Norms are calculated together on all gradients, just as they are
+ connected into one vector. The gradient will be modified in place.
+
+ This API can only run in dynamic graph mode, not static graph mode.
+
+ Args:
+        parameters (Iterable[paddle.Tensor] or paddle.Tensor): an iterable of Tensors
+            or a single Tensor whose gradients will be normalized
+ max_norm (float or int): max norm of the gradients
+ norm_type (float or int): type of the used p-norm. Can be `inf` for
+ infinity norm.
+ error_if_nonfinite (bool): if True, throw an error if the total
+ norm of the gradients from :attr:`parameters` is `nan`,
+ `inf`, or `-inf`.
+
+ Returns:
+ Total norm of the parameter gradients (treated as a single vector).
+ Example:
+ .. code-block:: python
+ import paddle
+
+ x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
+ max_norm = float(5.0)
+ linear = paddle.nn.Linear(in_features=10, out_features=10)
+ out = linear(x)
+ loss = paddle.mean(out)
+ loss.backward()
+
+ paddle.nn.utils.clip_grad_norm_(linear.parameters(), max_norm)
+
+            sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters())
+            sgd.step()
+ """
+ if not paddle.in_dynamic_mode():
+ raise RuntimeError('this API can only run in dynamic mode.')
+
+ if isinstance(parameters, paddle.Tensor):
+ parameters = [parameters]
+
+ support_norm_type = [float("inf"), 0, 1, 2]
+ if norm_type not in support_norm_type:
+ raise ValueError(f'norm_type only support {support_norm_type}')
+
+ grads = [p.grad for p in parameters if p.grad is not None]
+ max_norm = float(max_norm)
+ norm_type = float(norm_type)
+ if len(grads) == 0:
+ return paddle.to_tensor(0.0)
+ if norm_type == float("inf"):
+ norms = [g.detach().abs().max() for g in grads]
+ total_norm = (norms[0]
+ if len(norms) == 1 else paddle.max(paddle.stack(norms)))
+ else:
+ total_norm = paddle.linalg.norm(
+ paddle.stack(
+ [paddle.linalg.norm(g.detach(), norm_type) for g in grads]),
+ norm_type, )
+
+ if error_if_nonfinite and paddle.logical_or(total_norm.isnan(),
+ total_norm.isinf()):
+ raise RuntimeError(
+            f'The total norm of order {norm_type} of the gradients from '
+            '`parameters` is non-finite, so it cannot be clipped. To disable '
+            'this error and scale the gradients by the non-finite norm anyway, '
+            'set `error_if_nonfinite=False`')
+ clip_coef = max_norm / (total_norm + 1e-6)
+ # Note: when the coef is clamped to 1, it is redundant to multiply the clamped coef, but this
+ # avoids the `if clip_coef < 1:` condition.
+ clip_coef_clamped = paddle.clip(clip_coef, max=1.0)
+ with paddle.no_grad():
+ for _, p in enumerate(parameters):
+ g = p.grad
+ if g is not None:
+ p.grad = paddle.multiply(x=g, y=clip_coef_clamped)
+ return total_norm
+
+
+class WavLMASRTrainer(Trainer):
+ def __init__(self, config, args):
+ super().__init__(config, args)
+ self.avg_train_loss = 0.0
+        self.loss_isfinite = True  # when False, the loss is NaN or inf and cannot be averaged
+        self.use_sb = True  # whether to use the SpeechBrain dataloader
+
+ def update_average(self, batch_index, loss):
+ """Update running average of the loss.
+ Arguments
+ ---------
+ batch_index : int
+ current batch index
+ loss : paddle.tensor
+ detached loss, a single float value.
+ """
+ if math.isfinite(loss):
+ self.avg_train_loss -= self.avg_train_loss / (batch_index + 1)
+ self.avg_train_loss += loss / (batch_index + 1)
+ else:
+ self.loss_isfinite = False
+            logger.info('loss: {} is NaN or inf, error'.format(loss))
+
+ def before_train(self):
+ from_scratch = self.resume_or_scratch()
+ if from_scratch:
+ # scratch: save init model, i.e. 0 epoch
+ self.save(tag='init', infos=None)
+ else:
+ # resume: train next_epoch and next_iteration
+ self.epoch += 1
+ logger.info(
+ f"Resume train: epoch {self.epoch }, step {self.iteration}!")
+
+ self.maybe_batch_sampler_step()
+
+ def train_batch(self, batch_index, batch, msg):
+ train_conf = self.config
+ start = time.time()
+
+ # forward
+ ## sb data pipeline
+ if self.use_sb:
+ wav, wavs_lens_rate = batch['sig']
+ target, target_lens_rate = batch['tokens']
+ target_lens = (target_lens_rate *
+ target.shape[1]).round().astype(paddle.int64)
+ else:
+ utt, wav, wavs_lens, target, target_lens = batch
+ wavs_lens_rate = wavs_lens / wav.shape[1]
+ wav = wav[:, :, 0]
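+            # wavs_lens_rate holds each utterance length as a fraction of the
+            # padded length; only the first channel of the waveform is kept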
+
+ if hasattr(train_conf, 'audio_augment'):
+ wav = self.speech_augmentation(wav, wavs_lens_rate)
+ loss = self.model(wav, wavs_lens_rate, target, target_lens)
+
+        # divide the loss by accum_grad for gradient accumulation
+ loss /= train_conf.accum_grad
+ # update self.avg_train_loss
+ self.update_average(batch_index, float(loss))
+
+ # loss backward
+ if (batch_index + 1) % train_conf.accum_grad != 0:
+ # Disable gradient synchronizations across DDP processes.
+ # Within this context, gradients will be accumulated on module
+ # variables, which will later be synchronized.
+ # When using cpu w/o DDP, model does not have `no_sync`
+ context = self.model.no_sync if (hasattr(self.model, "no_sync") and
+ self.parallel) else nullcontext
+ else:
+ # Used for single gpu training and DDP gradient synchronization
+ # processes.
+ context = nullcontext
+ with context():
+ loss.backward()
+
+ layer_tools.print_grads(self.model, print_func=None)
+
+        # NOTE: the code below revealed that backward() was problematic: as more steps are accumulated, the output from wavlm alone becomes the same for all frames
+ # optimizer step old
+ if (batch_index + 1) % train_conf.accum_grad == 0:
+            # do global grad clip
+ if train_conf.global_grad_clip != 0:
+ clip_grad_norm_(self.model.parameters(),
+ train_conf.global_grad_clip)
+ self.model_optimizer.step()
+ self.model_optimizer.clear_grad()
+ if not train_conf.freeze_wavlm:
+ self.wavlm_optimizer.step()
+ self.wavlm_optimizer.clear_grad()
+ if self.config.model_scheduler != 'newbobscheduler':
+ self.model_lr_scheduler.step()
+ if self.config.wavlm_scheduler != 'newbobscheduler':
+ if not train_conf.freeze_wavlm:
+ self.wavlm_lr_scheduler.step()
+ self.iteration += 1
+
+ losses_np = {'loss': self.avg_train_loss * train_conf.accum_grad}
+ iteration_time = time.time() - start
+ for k, v in losses_np.items():
+ report(k, v)
+ report("loss_whitoutavg", float(loss))
+ report("batch_size", self.config.batch_size)
+ report("accum", train_conf.accum_grad)
+ report("step_cost", iteration_time)
+
+ if (batch_index + 1) % train_conf.accum_grad == 0:
+ if dist.get_rank() == 0 and self.visualizer:
+ losses_np_v = losses_np.copy()
+ losses_np_v.update({
+ "model_lr": self.model_lr_scheduler(),
+ "wavlm_lr": self.wavlm_lr_scheduler()
+ })
+ for key, val in losses_np_v.items():
+ self.visualizer.add_scalar(
+ tag='train/' + key, value=val, step=self.iteration - 1)
+
+ @paddle.no_grad()
+ def valid(self):
+ self.model.eval()
+ if not self.use_streamdata:
+ logger.info(
+ f"Valid Total Examples: {len(self.valid_loader.dataset)}")
+ valid_losses = {}
+ step = 0
+ total_loss = 0.0
+        num_seen_utts = 1  # total_loss is already a running average, so num_seen_utts stays 1
+ for i, batch in enumerate(self.valid_loader):
+ if self.use_sb:
+ wav, wavs_lens_rate = batch['sig']
+ target, target_lens_rate = batch['tokens']
+ target_lens = (target_lens_rate *
+ target.shape[1]).round().astype(paddle.int64)
+ else:
+ utt, wav, wavs_lens, target, target_lens = batch
+ wavs_lens_rate = wavs_lens / wav.shape[1]
+ wav = wav[:, :, 0]
+
+ loss = self.model(wav, wavs_lens_rate, target, target_lens)
+ # use update_average
+ total_loss -= total_loss / (step + 1)
+ total_loss += loss / (step + 1)
+
+ if math.isfinite(float(loss)):
+ step += 1
+ valid_losses['val_loss'] = float(loss)
+ else:
+                logger.info('loss: {} is NaN or inf, error'.format(float(loss)))
+
+ if (i + 1) % self.config.log_interval == 0:
+ valid_losses['val_history_loss'] = float(total_loss)
+
+ # logging
+ msg = f"Valid: Rank: {dist.get_rank()}, "
+ msg += "epoch: {}, ".format(self.epoch)
+ msg += "step: {}, ".format(self.iteration)
+ if not self.use_streamdata:
+ msg += "batch: {}/{}, ".format(i + 1,
+ len(self.valid_loader))
+ msg += ', '.join('{}: {:>.6f}'.format(k, v)
+ for k, v in valid_losses.items())
+ logger.info(msg)
+
+ logger.info(
+ 'Rank {} Val info val_loss {}'.format(dist.get_rank(), total_loss))
+ return total_loss, num_seen_utts
+
+ @mp_tools.rank_zero_only
+ def save(self, tag=None, infos: dict=None):
+ """Save checkpoint (model parameters and optimizer states).
+
+ Args:
+ tag (int or str, optional): None for step, else using tag, e.g epoch. Defaults to None.
+ infos (dict, optional): meta data to save. Defaults to None.
+ """
+
+ infos = infos if infos else dict()
+ infos.update({
+ "epoch": self.epoch,
+ "model_lr": self.model_optimizer.get_lr(),
+ "wavlm_lr": self.wavlm_optimizer.get_lr()
+ })
+
+ checkpoint_path = os.path.join(
+ self.checkpoint_dir,
+ "{}".format(self.iteration if tag is None else tag))
+
+ model_dict = self.model.state_dict()
+ params_path = checkpoint_path + ".pdparams"
+ paddle.save(model_dict, params_path)
+ logger.info("Saved model to {}".format(params_path))
+
+ model_opt_dict = self.model_optimizer.state_dict()
+ wavlm_opt_dict = self.wavlm_optimizer.state_dict()
+
+ opt_dict = {'model': model_opt_dict, 'wavlm': wavlm_opt_dict}
+
+ optimizer_path = checkpoint_path + ".pdopt"
+ paddle.save(opt_dict, optimizer_path)
+ logger.info("Saved optimzier state to {}".format(optimizer_path))
+
+ scheduler_dict = {}
+
+ if self.config.model_scheduler == 'newbobscheduler':
+ scheduler_dict['model'] = self.model_lr_scheduler.save()
+ if self.config.wavlm_scheduler == 'newbobscheduler':
+ scheduler_dict['wavlm'] = self.wavlm_lr_scheduler.save()
+ if scheduler_dict:
+ scheduler_path = checkpoint_path + ".pdlrs"
+ paddle.save(scheduler_dict, scheduler_path)
+ logger.info("Saved scheduler state to {}".format(scheduler_path))
+ info_path = re.sub('.pdparams$', '.json', params_path)
+ infos = {} if infos is None else infos
+ with open(info_path, 'w', encoding='utf8') as fout:
+ data = json.dumps(infos)
+ fout.write(data)
+
+ def resume_or_scratch(self):
+ """Resume from latest checkpoint at checkpoints in the output
+ directory or load a specified checkpoint.
+
+ If ``args.checkpoint_path`` is not None, load the checkpoint, else
+ resume training.
+ """
+ scratch = None
+ if self.args.resume:
+ # just restore ckpt
+            # lr will be restored from the optimizer ckpt
+ resume_json_path = os.path.join(self.checkpoint_dir,
+ self.args.resume + '.json')
+ with open(resume_json_path, 'r', encoding='utf8') as f:
+ resume_json = json.load(f)
+ self.iteration = 0
+ self.epoch = resume_json["epoch"]
+
+            # restore model from *.pdparams
+ params_path = os.path.join(self.checkpoint_dir,
+ "{}".format(self.epoch)) + '.pdparams'
+ model_dict = paddle.load(params_path)
+ self.model.set_state_dict(model_dict)
+
+            # restore optimizer from *.pdopt
+ optimizer_path = os.path.join(self.checkpoint_dir,
+ "{}".format(self.epoch)) + '.pdopt'
+ optimizer_dict = paddle.load(optimizer_path)
+ self.model_optimizer.set_state_dict(optimizer_dict['model'])
+ self.wavlm_optimizer.set_state_dict(optimizer_dict['wavlm'])
+
+            # restore lr_scheduler from *.pdlrs
+ scheduler_path = os.path.join(self.checkpoint_dir,
+ "{}".format(self.epoch)) + '.pdlrs'
+ if os.path.isfile(os.path.join(scheduler_path)):
+ scheduler_dict = paddle.load(scheduler_path)
+ if self.config.model_scheduler == 'newbobscheduler':
+ self.model_lr_scheduler.load(scheduler_dict['model'])
+ if self.config.wavlm_scheduler == 'newbobscheduler':
+ self.wavlm_lr_scheduler.load(scheduler_dict['wavlm'])
+ logger.info(
+ f"Restore ckpt: epoch {self.epoch }, step {self.iteration}!")
+ scratch = False
+ else:
+ self.iteration = 0
+ self.epoch = 0
+ scratch = True
+ logger.info("Init from scratch!")
+ return scratch
+
+ def do_train(self):
+ """The training process control by step."""
+ # !!!IMPORTANT!!!
+ # Try to export the model by script, if fails, we should refine
+ # the code to satisfy the script export requirements
+ # script_model = paddle.jit.to_static(self.model)
+ # script_model_path = str(self.checkpoint_dir / 'init')
+ # paddle.jit.save(script_model, script_model_path)
+
+ self.before_train()
+ if not self.use_streamdata:
+ logger.info(
+ f"Train Total Examples: {len(self.train_loader.dataset)}")
+ while self.epoch < self.config.n_epoch:
+ with Timer("Epoch-Train Time Cost: {}"):
+ self.model.train()
+ try:
+ data_start_time = time.time()
+ for batch_index, batch in enumerate(self.train_loader):
+ dataload_time = time.time() - data_start_time
+ msg = "Train:"
+ observation = OrderedDict()
+ with ObsScope(observation):
+ report("Rank", dist.get_rank())
+ report("epoch", self.epoch)
+ report('step', self.iteration)
+ report("model_lr", self.model_optimizer.get_lr())
+ report("wavlm_lr",
+ self.wavlm_optimizer.get_lr())
+ self.train_batch(batch_index, batch, msg)
+ self.after_train_batch()
+ report('iter', batch_index + 1)
+ if not self.use_streamdata:
+ report('total', len(self.train_loader))
+ report('reader_cost', dataload_time)
+ observation['batch_cost'] = observation[
+ 'reader_cost'] + observation['step_cost']
+ observation['samples'] = observation['batch_size']
+ observation['ips,samples/s'] = observation[
+ 'batch_size'] / observation['batch_cost']
+ for k, v in observation.items():
+ msg += f" {k.split(',')[0]}: "
+ msg += f"{v:>.8f}" if isinstance(v,
+ float) else f"{v}"
+ msg += f" {k.split(',')[1]}" if len(
+ k.split(',')) == 2 else ""
+ msg += ","
+ msg = msg[:-1] # remove the last ","
+ if (batch_index + 1) % self.config.log_interval == 0:
+ logger.info(msg)
+ data_start_time = time.time()
+ except Exception as e:
+ logger.error(e)
+ raise e
+ with Timer("Eval Time Cost: {}"):
+ total_loss, num_seen_utts = self.valid()
+ if dist.get_world_size() > 1:
+ num_seen_utts = paddle.to_tensor(num_seen_utts)
+ dist.all_reduce(num_seen_utts)
+ total_loss = paddle.to_tensor(total_loss)
+ dist.all_reduce(total_loss)
+ cv_loss = total_loss / num_seen_utts
+ cv_loss = float(cv_loss)
+ else:
+ cv_loss = float(total_loss)
+ logger.info(
+ 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
+ if self.visualizer:
+ self.visualizer.add_scalar(
+ tag='eval/cv_loss', value=cv_loss, step=self.epoch)
+ self.visualizer.add_scalar(
+ tag='eval/model_lr',
+ value=self.model_lr_scheduler(),
+ step=self.epoch)
+ self.visualizer.add_scalar(
+ tag='eval/wavlm_lr',
+ value=self.wavlm_lr_scheduler(),
+ step=self.epoch)
+
+ if self.config.model_scheduler == 'newbobscheduler':
+ self.model_lr_scheduler.step(cv_loss)
+ if self.config.wavlm_scheduler == 'newbobscheduler':
+ if not self.config.freeze_wavlm:
+ self.wavlm_lr_scheduler.step(cv_loss)
+ self.save(tag=self.epoch, infos={'val_loss': cv_loss})
+ self.avg_train_loss = 0.0
+ self.new_epoch()
+
+ def dataio_prepare(self, hparams):
+ """This function prepares the datasets to be used in the brain class.
+ It also defines the data processing pipeline through user-defined functions."""
+ data_folder = hparams["data_folder"]
+
+ train_data = dataset.DynamicItemDataset.from_csv(
+ csv_path=hparams["train_data"],
+ replacements={"data_root": data_folder}, )
+
+ if hparams["sorting"] == "ascending":
+ # we sort training data to speed up training and get better results.
+ train_data = train_data.filtered_sorted(sort_key="duration")
+            # when sorting, do not shuffle in the dataloader! otherwise sorting is pointless
+ hparams["train_dataloader_opts"]["shuffle"] = False
+
+ elif hparams["sorting"] == "descending":
+ train_data = train_data.filtered_sorted(
+ sort_key="duration", reverse=True)
+            # when sorting, do not shuffle in the dataloader! otherwise sorting is pointless
+ hparams["train_dataloader_opts"]["shuffle"] = False
+
+ elif hparams["sorting"] == "random":
+ pass
+
+ else:
+ raise NotImplementedError(
+ "sorting must be random, ascending or descending")
+
+ valid_data = dataset.DynamicItemDataset.from_csv(
+ csv_path=hparams["valid_data"],
+ replacements={"data_root": data_folder}, )
+ valid_data = valid_data.filtered_sorted(sort_key="duration")
+
+ test_data = dataset.DynamicItemDataset.from_csv(
+ csv_path=hparams["test_data"],
+ replacements={"data_root": data_folder}, )
+ test_data = test_data.filtered_sorted(sort_key="duration")
+
+ datasets = [train_data, valid_data, test_data]
+
+ # Defining tokenizer and loading it
+ tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
+ self.tokenizer = tokenizer
+ # 2. Define audio pipeline:
+ @data_pipeline.takes("wav")
+ @data_pipeline.provides("sig")
+ def audio_pipeline(wav):
+ sig = dataio.read_audio(wav)
+ return sig
+
+ dataset.add_dynamic_item(datasets, audio_pipeline)
+
+ # 3. Define text pipeline:
+ @data_pipeline.takes("transcript")
+ @data_pipeline.provides("wrd", "tokens_list", "tokens")
+ def text_pipeline(wrd):
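+            # each yield below fills the next key declared in @provides:
+            # "wrd", then "tokens_list", then "tokens"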
+ wrd = "".join(wrd.split(" "))
+ yield wrd
+ tokens_list = tokenizer(wrd)["input_ids"]
+ yield tokens_list
+ tokens = np.array(tokens_list, dtype="int64")
+ # tokens = paddle.to_tensor(tokens_list, dtype="int64")
+ yield tokens
+
+ dataset.add_dynamic_item(datasets, text_pipeline)
+
+ # 4. Set output:
+ dataset.set_output_keys(
+ datasets,
+ ["id", "sig", "wrd", "tokens"], )
+
+ # 5. If Dynamic Batching is used, we instantiate the needed samplers.
+ train_batch_sampler = None
+ valid_batch_sampler = None
+ if hparams["dynamic_batching"]:
+ from sampler import DynamicBatchSampler # noqa
+
+ dynamic_hparams = hparams["dynamic_batch_sampler"]
+ num_buckets = dynamic_hparams["num_buckets"]
+
+ train_batch_sampler = DynamicBatchSampler(
+ train_data,
+ dynamic_hparams["max_batch_len"],
+ num_buckets=num_buckets,
+ length_func=lambda x: x["duration"],
+ shuffle=dynamic_hparams["shuffle_ex"],
+ batch_ordering=dynamic_hparams["batch_ordering"], )
+
+ valid_batch_sampler = DynamicBatchSampler(
+ valid_data,
+ dynamic_hparams["max_batch_len"],
+ num_buckets=num_buckets,
+ length_func=lambda x: x["duration"],
+ shuffle=dynamic_hparams["shuffle_ex"],
+ batch_ordering=dynamic_hparams["batch_ordering"], )
+
+ return (train_data, valid_data, test_data, tokenizer,
+ train_batch_sampler, valid_batch_sampler, )
+
+ def setup_dataloader(self):
+ config = self.config.clone()
+ self.use_streamdata = config.get("use_stream_data", False)
+ self.use_sb = config.get("use_sb_pipeline", False)
+ if self.use_sb:
+ hparams_file = config.sb_pipeline_conf
+ with open(hparams_file, 'r', encoding='utf8') as fin:
+ hparams = load_hyperpyyaml(fin, None)
+
+ (train_data, valid_data, test_data, tokenizer, train_bsampler,
+ valid_bsampler, ) = self.dataio_prepare(hparams)
+
+ train_dataloader_opts = hparams["train_dataloader_opts"]
+ valid_dataloader_opts = hparams["valid_dataloader_opts"]
+
+ if train_bsampler is not None:
+ train_dataloader_opts = {
+ "batch_sampler": train_bsampler,
+ "num_workers": hparams["num_workers"],
+ }
+
+ if valid_bsampler is not None:
+ valid_dataloader_opts = {"batch_sampler": valid_bsampler}
+
+ if self.train:
+ self.train_loader = make_dataloader(
+ train_data, stage='train', **train_dataloader_opts)
+ self.valid_loader = make_dataloader(
+ valid_data,
+ stage='val',
+ **valid_dataloader_opts, )
+ logger.info("Setup train/valid Dataloader!")
+ else:
+ self.test_loader = make_dataloader(
+ test_data, stage='test', **hparams["test_dataloader_opts"])
+ else:
+ if self.train:
+ self.train_loader = DataLoaderFactory.get_dataloader(
+ 'train', config, self.args)
+ self.valid_loader = DataLoaderFactory.get_dataloader(
+ 'valid', config, self.args)
+ logger.info("Setup train/valid Dataloader!")
+ else:
+ decode_batch_size = config.get('decode', dict()).get(
+ 'decode_batch_size', 1)
+ self.test_loader = DataLoaderFactory.get_dataloader(
+ 'test', config, self.args)
+ self.align_loader = DataLoaderFactory.get_dataloader(
+ 'align', config, self.args)
+ logger.info("Setup test/align Dataloader!")
+
+ def setup_model(self):
+ config = self.config
+ model_conf = config
+
+ with UpdateConfig(model_conf):
+ if self.use_sb:
+ model_conf.output_dim = self.tokenizer.vocab_size
+ else:
+ if self.train:
+ model_conf.input_dim = self.train_loader.feat_dim
+ model_conf.output_dim = self.train_loader.vocab_size
+ else:
+ model_conf.input_dim = self.test_loader.feat_dim
+ model_conf.output_dim = self.test_loader.vocab_size
+
+ model = WavLMASR.from_config(model_conf)
+
+ model_dict = paddle.load(config.wavlm_params_path)
+ model.wavlm.set_state_dict(model_dict)
+
+ if self.parallel:
+ model = paddle.DataParallel(model, find_unused_parameters=True)
+
+ layer_tools.print_params(model, logger.info)
+ self.model = model
+ logger.info("Setup model!")
+
+ # setup speech augmentation for wavlm
+ if hasattr(config, 'audio_augment') and self.train:
+ self.speech_augmentation = TimeDomainSpecAugment(
+ **config.audio_augment)
+
+ if not self.train:
+ return
+
+ train_config = config
+ model_optim_type = train_config.model_optim
+ model_optim_conf = train_config.model_optim_conf
+ logger.info("optim_model:{},{}", model_optim_type, model_optim_conf)
+ wavlm_optim_type = train_config.wavlm_optim
+ wavlm_optim_conf = train_config.wavlm_optim_conf
+ logger.info("optim_model:{},{}", wavlm_optim_type,
+ wavlm_optim_conf)
+
+ model_scheduler_type = train_config.model_scheduler
+ model_scheduler_conf = train_config.model_scheduler_conf
+ wavlm_scheduler_type = train_config.wavlm_scheduler
+ wavlm_scheduler_conf = train_config.wavlm_scheduler_conf
+
+ model_scheduler_args = dict(
+ **{"learning_rate": model_optim_conf.lr,
+ "verbose": False}, **(dict(model_scheduler_conf)))
+
+ wavlm_scheduler_args = dict(
+ **{"learning_rate": wavlm_optim_conf.lr,
+ "verbose": False}, **(dict(wavlm_scheduler_conf)))
+
+ model_lr_scheduler = LRSchedulerFactory.from_args(model_scheduler_type,
+ model_scheduler_args)
+ wavlm_lr_scheduler = LRSchedulerFactory.from_args(
+ wavlm_scheduler_type, wavlm_scheduler_args)
+
+ def optimizer_args(
+ config,
+ optim_type,
+ optim_conf,
+ parameters,
+ lr_scheduler=None, ):
+ optim_arg = dict(optim_conf)
+ optim_arg.update({
+ "learning_rate":
+ lr_scheduler if lr_scheduler else optim_conf.lr,
+ "parameters":
+ parameters
+ })
+ return optim_arg
+
+ model_optimizer_args = optimizer_args(
+ config, model_optim_type,
+ model_optim_conf,
+            [{'params': model._layers.enc.parameters()},
+             {'params': model._layers.ctc.parameters()}] if self.parallel else
+            [{'params': model.enc.parameters()},
+             {'params': model.ctc.parameters()}],
+ model_lr_scheduler
+ )
+
+
+ wavlm_optimizer_args = optimizer_args(
+ config, wavlm_optim_type, wavlm_optim_conf,
+ model._layers.wavlm.parameters() if self.parallel else
+ model.wavlm.parameters(), wavlm_lr_scheduler)
+
+ model_optimizer = OptimizerFactory.from_args(model_optim_type,
+ model_optimizer_args)
+ wavlm_optimizer = OptimizerFactory.from_args(wavlm_optim_type,
+ wavlm_optimizer_args)
+
+ self.model_optimizer = model_optimizer
+ self.wavlm_optimizer = wavlm_optimizer
+ self.model_lr_scheduler = model_lr_scheduler
+ self.wavlm_lr_scheduler = wavlm_lr_scheduler
+ logger.info("Setup optimizer/lr_scheduler!")
+
+
+class WavLMASRTester(WavLMASRTrainer):
+ def __init__(self, config, args):
+ super().__init__(config, args)
+ self.text_featurizer = TextFeaturizer(
+ unit_type=config.unit_type, vocab=config.vocab_filepath)
+ self.vocab_list = self.text_featurizer.vocab_list
+
+ def id2token(self, texts, texts_len):
+ """ ord() id to chr() chr """
+ trans = []
+ for text, n in zip(texts, texts_len):
+ n = n.numpy().item()
+ ids = text[:n]
+ trans.append(self.text_featurizer.defeaturize(ids.numpy().tolist()))
+ return trans
+
+ def compute_metrics(self, id, audio, audio_len, texts, texts_len,
+ fout=None):
+ decode_cfg = self.config.decode
+ errors_sum, len_refs, num_ins = 0.0, 0, 0
+ errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors
+ error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer
+
+ start_time = time.time()
+ target_transcripts = self.id2token(texts, texts_len)
+ result_transcripts, result_tokenids = self.model.decode(
+ audio,
+ text_feature=self.text_featurizer,
+ decoding_method=decode_cfg.decoding_method,
+ beam_size=decode_cfg.beam_size)
+ decode_time = time.time() - start_time
+
+ for utt, target, result, rec_tids in zip(
+ id, target_transcripts, result_transcripts, result_tokenids):
+ errors, len_ref = errors_func(target, result)
+ errors_sum += errors
+ len_refs += len_ref
+ num_ins += 1
+ if fout:
+ fout.write({
+ "utt": utt,
+ "refs": [target],
+ "hyps": [result],
+ "hyps_tokenid": [rec_tids],
+ })
+ logger.info(f"Utt: {utt}")
+ logger.info(f"Ref: {target}")
+ logger.info(f"Hyp: {result}")
+ logger.info("One example error rate [%s] = %f" % (
+ decode_cfg.error_rate_type, error_rate_func(target, result)))
+
+ return dict(
+ errors_sum=errors_sum,
+ len_refs=len_refs,
+ num_ins=num_ins, # num examples
+ error_rate=errors_sum / len_refs,
+ error_rate_type=decode_cfg.error_rate_type,
+ num_frames=audio_len.sum().numpy().item(),
+ decode_time=decode_time)
+
+ def sb_compute_metrics(self, id, sig, wrd, tokens, fout=None):
+ decode_cfg = self.config.decode
+ errors_sum, len_refs, num_ins = 0.0, 0, 0
+ errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors
+ error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer
+ start_time = time.time()
+ target_transcripts = wrd
+ result_transcripts, result_tokenids = self.model.decode(
+ sig[0],
+ text_feature=self.tokenizer,
+ decoding_method=decode_cfg.decoding_method,
+ beam_size=decode_cfg.beam_size,
+ sb_pipeline=True)
+ decode_time = time.time() - start_time
+
+ for utt, target, result, rec_tids in zip(
+ id, target_transcripts, result_transcripts, result_tokenids):
+ errors, len_ref = errors_func(target, result)
+ errors_sum += errors
+ len_refs += len_ref
+ num_ins += 1
+ if fout:
+ fout.write({
+ "utt": utt,
+ "refs": [target],
+ "hyps": [result],
+ "hyps_tokenid": [rec_tids],
+ })
+ logger.info(f"Utt: {utt}")
+ logger.info(f"Ref: {target}")
+ logger.info(f"Hyp: {result}")
+ logger.info("One example error rate [%s] = %f" % (
+ decode_cfg.error_rate_type, error_rate_func(target, result)))
+
+ return dict(
+ errors_sum=errors_sum,
+ len_refs=len_refs,
+ num_ins=num_ins, # num examples
+ error_rate=errors_sum / len_refs,
+ error_rate_type=decode_cfg.error_rate_type,
+ num_frames=sig[1].sum().numpy().item(),
+ decode_time=decode_time)
+
+ @mp_tools.rank_zero_only
+ @paddle.no_grad()
+ def test(self):
+ logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
+ self.model.eval()
+
+ error_rate_type = None
+ errors_sum, len_refs, num_ins = 0.0, 0, 0
+ num_frames = 0.0
+ num_time = 0.0
+ # Initialized the decoder in model
+ decode_cfg = self.config.decode
+ vocab_list = self.vocab_list
+ decode_batch_size = decode_cfg.decode_batch_size
+
+ with jsonlines.open(self.args.result_file, 'w') as fout:
+ for i, batch in enumerate(self.test_loader):
+ if self.use_sb:
+ metrics = self.sb_compute_metrics(**batch, fout=fout)
+ else:
+ metrics = self.compute_metrics(*batch, fout=fout)
+ num_frames += metrics['num_frames']
+ num_time += metrics["decode_time"]
+ errors_sum += metrics['errors_sum']
+ len_refs += metrics['len_refs']
+ num_ins += metrics['num_ins']
+ error_rate_type = metrics['error_rate_type']
+ rtf = num_time / (num_frames)
+ logger.info(
+ "RTF: %f, Error rate [%s] (%d/?) = %f" %
+ (rtf, error_rate_type, num_ins, errors_sum / len_refs))
+
+ # logging
+ msg = "Test: "
+ msg += "epoch: {}, ".format(self.epoch)
+ msg += "step: {}, ".format(self.iteration)
+ msg += "Final error rate [%s] (%d/%d) = %f" % (
+ error_rate_type, num_ins, num_ins, errors_sum / len_refs)
+ logger.info(msg)
+
+ err_meta_path = os.path.splitext(self.args.result_file)[0] + '.err'
+ err_type_str = "{}".format(error_rate_type)
+ with open(err_meta_path, 'w', encoding='utf8') as f:
+ data = json.dumps({
+ "epoch":
+ self.epoch,
+ "step":
+ self.iteration,
+ "rtf":
+ rtf,
+ error_rate_type:
+ errors_sum / len_refs,
+ "dataset_hour": (num_frames) / 1000.0 / 3600.0,
+ "process_hour":
+ num_time / 1000.0 / 3600.0,
+ "num_examples":
+ num_ins,
+ "err_sum":
+ errors_sum,
+ "ref_len":
+ len_refs,
+ "decode_method":
+ self.config.decode.decoding_method,
+ })
+ f.write(data + '\n')
diff --git a/paddlespeech/s2t/frontend/augmentor/augmentation.py b/paddlespeech/s2t/frontend/augmentor/augmentation.py
index 4c5ca4fe..744ea56d 100644
--- a/paddlespeech/s2t/frontend/augmentor/augmentation.py
+++ b/paddlespeech/s2t/frontend/augmentor/augmentation.py
@@ -45,7 +45,7 @@ class AugmentationPipeline():
samples to make the model invariant to certain types of perturbations in the
real world, improving model's generalization ability.
- The pipeline is built according the the augmentation configuration in json
+ The pipeline is built according to the augmentation configuration in json
string, e.g.
.. code-block::
diff --git a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py
index 982c6b8f..7623d0b8 100644
--- a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py
+++ b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py
@@ -48,13 +48,16 @@ class TextFeaturizer():
self.unit_type = unit_type
self.unk = UNK
self.maskctc = maskctc
+ self.vocab_path_or_list = vocab
- if vocab:
+ if self.vocab_path_or_list:
self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id, self.blank_id = self._load_vocabulary_from_file(
vocab, maskctc)
self.vocab_size = len(self.vocab_list)
else:
- logger.warning("TextFeaturizer: not have vocab file or vocab list.")
+            logger.warning(
+                "TextFeaturizer: no vocab file or vocab list provided. Only the tokenizer can be used; text cannot be converted to token ids."
+            )
if unit_type == 'spm':
spm_model = spm_model_prefix + '.model'
@@ -62,6 +65,7 @@ class TextFeaturizer():
self.sp.Load(spm_model)
def tokenize(self, text, replace_space=True):
+ """tokenizer split text into text tokens"""
if self.unit_type == 'char':
tokens = self.char_tokenize(text, replace_space)
elif self.unit_type == 'word':
@@ -71,6 +75,7 @@ class TextFeaturizer():
return tokens
def detokenize(self, tokens):
+ """tokenizer convert text tokens back to text"""
if self.unit_type == 'char':
text = self.char_detokenize(tokens)
elif self.unit_type == 'word':
@@ -88,6 +93,7 @@ class TextFeaturizer():
Returns:
List[int]: List of token indices.
"""
+ assert self.vocab_path_or_list, "toidx need vocab path or vocab list"
tokens = self.tokenize(text)
ids = []
for token in tokens:
@@ -107,6 +113,7 @@ class TextFeaturizer():
Returns:
str: Text.
"""
+ assert self.vocab_path_or_list, "toidx need vocab path or vocab list"
tokens = []
for idx in idxs:
if idx == self.eos_id:
@@ -127,10 +134,10 @@ class TextFeaturizer():
"""
text = text.strip()
if replace_space:
- text_list = [SPACE if item == " " else item for item in list(text)]
+ tokens = [SPACE if item == " " else item for item in list(text)]
else:
- text_list = list(text)
- return text_list
+ tokens = list(text)
+ return tokens
def char_detokenize(self, tokens):
"""Character detokenizer.
diff --git a/paddlespeech/s2t/io/reader.py b/paddlespeech/s2t/io/reader.py
index 5e018bef..d433a643 100644
--- a/paddlespeech/s2t/io/reader.py
+++ b/paddlespeech/s2t/io/reader.py
@@ -14,9 +14,12 @@
# Modified from espnet(https://github.com/espnet/espnet)
from collections import OrderedDict
+import io
+import os
import kaldiio
import numpy as np
import soundfile
+import h5py
from .utility import feat_type
from paddlespeech.audio.transform.transformation import Transformation
@@ -401,7 +404,7 @@ class SoundHDF5File():
def __contains__(self, item):
return item in self.file
- def __len__(self, item):
+ def __len__(self):
return len(self.file)
def __enter__(self):
diff --git a/paddlespeech/s2t/io/speechbrain/sampler.py b/paddlespeech/s2t/io/speechbrain/sampler.py
index ba13193e..09a884c2 100755
--- a/paddlespeech/s2t/io/speechbrain/sampler.py
+++ b/paddlespeech/s2t/io/speechbrain/sampler.py
@@ -283,7 +283,7 @@ class DynamicBatchSampler(Sampler):
num_quantiles, )
# get quantiles using lognormal distribution
quantiles = lognorm.ppf(latent_boundaries, 1)
- # scale up to to max_batch_length
+ # scale up to max_batch_length
bucket_boundaries = quantiles * max_batch_length / quantiles[-1]
# compute resulting bucket length multipliers
length_multipliers = [
diff --git a/paddlespeech/s2t/models/hubert/__init__.py b/paddlespeech/s2t/models/hubert/__init__.py
new file mode 100644
index 00000000..87887a4c
--- /dev/null
+++ b/paddlespeech/s2t/models/hubert/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .hubert_ASR import HubertASR
+from .hubert_ASR import HubertBase
+
+__all__ = ["HubertASR", "HubertBase"]
diff --git a/paddlespeech/s2t/models/hubert/hubert_ASR.py b/paddlespeech/s2t/models/hubert/hubert_ASR.py
new file mode 100644
index 00000000..df347589
--- /dev/null
+++ b/paddlespeech/s2t/models/hubert/hubert_ASR.py
@@ -0,0 +1,368 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""HubertASR model."""
+from collections import defaultdict
+from copy import deepcopy
+from dataclasses import dataclass
+from dataclasses import is_dataclass
+from typing import Dict
+from typing import List
+from typing import Tuple
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddlespeech.s2t.models.hubert.modules.hubert_model import HubertConfig
+from paddlespeech.s2t.models.hubert.modules.hubert_model import HubertModel
+from paddlespeech.s2t.models.hubert.modules.hubert_model import HubertPretrainingConfig
+from paddlespeech.s2t.models.wav2vec2.modules.VanillaNN import VanillaNN
+from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import SpecAugment
+from paddlespeech.s2t.modules.ctc import CTCDecoderBase as CTC
+from paddlespeech.s2t.modules.initializer import DefaultInitializerContext
+from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.s2t.utils.utility import log_add
+
+logger = Log(__name__).getlog()
+
+
+class HubertASR(nn.Layer):
+ def __init__(self, config: dict):
+ super().__init__()
+ init_type = config.get("init_type", None)
+ with DefaultInitializerContext(init_type):
+ self.config = config
+ task_cfg = self.merge_with_parent(HubertPretrainingConfig,
+ dict(self.config.task_cfg))
+ model_cfg = self.merge_with_parent(HubertConfig,
+ dict(self.config.model_cfg))
+ hubert = HubertModel(model_cfg, task_cfg, [None])
+
+ self.normalize_wav = config.normalize_wav
+ self.output_norm = config.output_norm
+ if hasattr(config, 'spec_augment'):
+ self.spec_augment = SpecAugment(**config.spec_augment)
+
+ if config.freeze_hubert:
+ hubert.eval()
+                for param in hubert.parameters():
+                    param.trainable = False
+ self.hubert = hubert
+ self.enc = VanillaNN(**config.enc)
+ self.ctc = CTC(**config.ctc,
+ odim=config.output_dim,
+ batch_average=False,
+ reduction='mean')
+
+ def merge_with_parent(self, dc: dataclass, cfg: dict):
+ assert is_dataclass(dc)
+ assert type(cfg) == dict
+ cfg = deepcopy(cfg)
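+        # keep only the keys that are fields of the target dataclass, then
+        # instantiate it; extra keys from the YAML config are silently dropped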
+
+ def fix_cfg(cfg):
+ target_keys = set(dc.__dataclass_fields__.keys())
+ for k in list(cfg.keys()):
+ if k not in target_keys:
+ del cfg[k]
+
+ fix_cfg(cfg)
+ assert len(cfg) > 0
+ return dc(**cfg)
+
+ def forward(self, wav, wavs_lens_rate, target, target_lens):
+
+ if self.normalize_wav:
+ wav = F.layer_norm(wav, wav.shape)
+
+ # Extract wav2vec output
+ out = self.hubert.extract_features(wav)[0]
+ # We normalize the output if required
+ if self.output_norm:
+ out = F.layer_norm(out, out.shape)
+
+ if self.training and hasattr(self.config, 'spec_augment'):
+ feats = self.spec_augment(out)
+ else:
+ feats = out
+
+ x = self.enc(feats)
+
+ x_lens = (wavs_lens_rate * x.shape[1]).round().astype(paddle.int64)
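+        # x_lens: number of valid encoder frames per utterance, recovered from
+        # the relative (unpadded / padded) waveform lengths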
+
+ ctc_loss = self.ctc(x, x_lens, target, target_lens)
+
+ return ctc_loss
+
+ @paddle.no_grad()
+ def decode(self,
+ feats: paddle.Tensor,
+ text_feature: Dict[str, int],
+ decoding_method: str,
+ beam_size: int,
+ tokenizer: str=None,
+ sb_pipeline=False):
+ batch_size = feats.shape[0]
+
+ if decoding_method == 'ctc_prefix_beam_search' and batch_size > 1:
+ logger.error(
+ f"decoding mode {decoding_method} must be running with batch_size == 1"
+ )
+ logger.error(f"current batch_size is {batch_size}")
+
+ if decoding_method == 'ctc_greedy_search':
+ if tokenizer is None and sb_pipeline is False:
+ hyps = self.ctc_greedy_search(feats)
+ res = [text_feature.defeaturize(hyp) for hyp in hyps]
+ res_tokenids = [hyp for hyp in hyps]
+ else:
+ if sb_pipeline is True:
+ hyps = self.ctc_greedy_search(feats.unsqueeze(-1))
+ else:
+ hyps = self.ctc_greedy_search(feats)
+ res = []
+ res_tokenids = []
+ for sequence in hyps:
+ # Decode token terms to words
+ predicted_tokens = text_feature.convert_ids_to_tokens(
+ sequence)
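+                    # with a BERT-style tokenizer, skip [CLS] and stop at [SEP]/[PAD]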
+ tmp_res = []
+ tmp_res_tokenids = []
+ for c in predicted_tokens:
+ if c == "[CLS]":
+ continue
+ elif c == "[SEP]" or c == "[PAD]":
+ break
+ else:
+ tmp_res.append(c)
+ tmp_res_tokenids.append(text_feature.vocab[c])
+ res.append(''.join(tmp_res))
+ res_tokenids.append(tmp_res_tokenids)
+
+ # ctc_prefix_beam_search and attention_rescoring only return one
+ # result in List[int], change it to List[List[int]] for compatible
+ # with other batch decoding mode
+ elif decoding_method == 'ctc_prefix_beam_search':
+ assert feats.shape[0] == 1
+ if tokenizer is None and sb_pipeline is False:
+ hyp = self.ctc_prefix_beam_search(feats, beam_size)
+ res = [text_feature.defeaturize(hyp)]
+ res_tokenids = [hyp]
+ else:
+ if sb_pipeline is True:
+ hyp = self.ctc_prefix_beam_search(
+ feats.unsqueeze(-1), beam_size)
+ else:
+ hyp = self.ctc_prefix_beam_search(feats, beam_size)
+ res = []
+ res_tokenids = []
+ predicted_tokens = text_feature.convert_ids_to_tokens(hyp)
+ tmp_res = []
+ tmp_res_tokenids = []
+ for c in predicted_tokens:
+ if c == "[CLS]":
+ continue
+ elif c == "[SEP]" or c == "[PAD]":
+ break
+ else:
+ tmp_res.append(c)
+ tmp_res_tokenids.append(text_feature.vocab[c])
+ res.append(''.join(tmp_res))
+ res_tokenids.append(tmp_res_tokenids)
+ else:
+ raise ValueError(
+ f"wav2vec2 not support decoding method: {decoding_method}")
+
+ return res, res_tokenids
+
+ @classmethod
+ def from_config(cls, config):
+ model = cls(config)
+ return model
+
+ def ctc_greedy_search(self, wav) -> List[List[int]]:
+ """ Apply CTC greedy search
+ Args:
+            wav (paddle.Tensor): raw waveform, shape (batch, max_len, channels)
+ Returns:
+ List[List[int]]: best path result
+ """
+ batch_size = wav.shape[0]
+ wav = wav[:, :, 0]
+ if self.normalize_wav:
+ wav = F.layer_norm(wav, wav.shape[1:])
+        # Extract hubert output
+ out = self.hubert.extract_features(wav)[0]
+ # We normalize the output if required
+ if self.output_norm:
+ out = F.layer_norm(out, out.shape[1:])
+ feats = out
+ x = self.enc(feats)
+ x_lens = x.shape[1]
+ ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size)
+ topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1)
+ topk_index = topk_index.view(batch_size, x_lens) # (B, maxlen)
+
+ hyps = [hyp.tolist() for hyp in topk_index]
+ hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]
+ return hyps
+
+ def _ctc_prefix_beam_search(
+ self,
+ wav,
+ beam_size,
+ blank_id: int=0, ) -> Tuple[List[Tuple[int, float]], paddle.Tensor]:
+ """ CTC prefix beam search inner implementation
+ Args:
+ speech (paddle.Tensor): (batch, max_len, feat_dim)
+ speech_length (paddle.Tensor): (batch, )
+ beam_size (int): beam size for beam search
+ decoding_chunk_size (int): decoding chunk for dynamic chunk
+ trained model.
+ <0: for decoding, use full chunk.
+ >0: for decoding, use fixed chunk size as set.
+ 0: used for training, it's prohibited here
+ simulate_streaming (bool): whether do encoder forward in a
+ streaming fashion
+ Returns:
+ List[Tuple[int, float]]: nbest results, (N,1), (text, likelihood)
+ paddle.Tensor: encoder output, (1, max_len, encoder_dim),
+ it will be used for rescoring in attention rescoring mode
+ """
+ wav = wav[:, :, 0]
+
+ if self.normalize_wav:
+ wav = F.layer_norm(wav, wav.shape[1:])
+        # Extract hubert output
+ out = self.hubert.extract_features(wav)[0]
+ # We normalize the output if required
+ if self.output_norm:
+ out = F.layer_norm(out, out.shape[1:])
+ feats = out
+
+ x = self.enc(feats)
+ maxlen = x.shape[1]
+ ctc_probs = self.ctc.log_softmax(x) # (1, maxlen, vocab_size)
+ ctc_probs = ctc_probs.squeeze(0)
+
+ # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score))
+ # blank_ending_score and none_blank_ending_score in ln domain
+ cur_hyps = [(tuple(), (0.0, -float('inf')))]
+ # 2. CTC beam search step by step
+ for t in range(0, maxlen):
+ logp = ctc_probs[t] # (vocab_size,)
+ # key: prefix, value (pb, pnb), default value(-inf, -inf)
+ next_hyps = defaultdict(lambda: (-float('inf'), -float('inf')))
+ # 2.1 First beam prune: select topk best
+ top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,)
+ for s in top_k_index:
+ s = s.item()
+ ps = logp[s].item()
+ for prefix, (pb, pnb) in cur_hyps:
+ last = prefix[-1] if len(prefix) > 0 else None
+ if s == blank_id: # blank
+ n_pb, n_pnb = next_hyps[prefix]
+ n_pb = log_add([n_pb, pb + ps, pnb + ps])
+ next_hyps[prefix] = (n_pb, n_pnb)
+ elif s == last:
+ # Update *ss -> *s;
+ n_pb, n_pnb = next_hyps[prefix]
+ n_pnb = log_add([n_pnb, pnb + ps])
+ next_hyps[prefix] = (n_pb, n_pnb)
+ # Update *s-s -> *ss, - is for blank
+ n_prefix = prefix + (s, )
+ n_pb, n_pnb = next_hyps[n_prefix]
+ n_pnb = log_add([n_pnb, pb + ps])
+ next_hyps[n_prefix] = (n_pb, n_pnb)
+ else:
+ n_prefix = prefix + (s, )
+ n_pb, n_pnb = next_hyps[n_prefix]
+ n_pnb = log_add([n_pnb, pb + ps, pnb + ps])
+ next_hyps[n_prefix] = (n_pb, n_pnb)
+
+ # 2.2 Second beam prune
+ next_hyps = sorted(
+ next_hyps.items(),
+ key=lambda x: log_add(list(x[1])),
+ reverse=True)
+ cur_hyps = next_hyps[:beam_size]
+
+ hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps]
+ return hyps
+
+ def ctc_prefix_beam_search(self, wav, beam_size) -> List[int]:
+ """ Apply CTC prefix beam search
+ Args:
+ speech (paddle.Tensor): (batch, max_len, feat_dim)
+ speech_length (paddle.Tensor): (batch, )
+ beam_size (int): beam size for beam search
+ decoding_chunk_size (int): decoding chunk for dynamic chunk
+ trained model.
+ <0: for decoding, use full chunk.
+ >0: for decoding, use fixed chunk size as set.
+ 0: used for training, it's prohibited here
+ simulate_streaming (bool): whether do encoder forward in a
+ streaming fashion
+ Returns:
+ List[int]: CTC prefix beam search nbest results
+ """
+ hyps = self._ctc_prefix_beam_search(wav, beam_size)
+ return hyps[0][0]
+
+
+class HubertBase(nn.Layer):
+ """Hubert model"""
+
+ def __init__(self, config: dict):
+ super().__init__()
+ self.config = config
+ task_cfg = self.merge_with_parent(HubertPretrainingConfig,
+ dict(self.config.task_cfg))
+ model_cfg = self.merge_with_parent(HubertConfig,
+ dict(self.config.model_cfg))
+ hubert = HubertModel(model_cfg, task_cfg, [None])
+ self.hubert = hubert
+
+ @classmethod
+ def from_config(cls, configs: dict):
+ """init model.
+ Args:
+ configs (dict): config dict.
+ Raises:
+ ValueError: raise when using not support encoder type.
+ Returns:
+ nn.Layer: HubertBase
+ """
+ model = cls(configs)
+ return model
+
+ def merge_with_parent(self, dc: dataclass, cfg: dict):
+ assert is_dataclass(dc)
+ assert type(cfg) == dict
+ cfg = deepcopy(cfg)
+
+ def fix_cfg(cfg):
+ target_keys = set(dc.__dataclass_fields__.keys())
+ for k in list(cfg.keys()):
+ if k not in target_keys:
+ del cfg[k]
+
+ fix_cfg(cfg)
+ assert len(cfg) > 0
+ return dc(**cfg)
+
+ def forward(self, wav):
+ out = self.hubert.extract_features(wav)
+ return out
diff --git a/paddlespeech/s2t/models/hubert/modules/__init__.py b/paddlespeech/s2t/models/hubert/modules/__init__.py
new file mode 100644
index 00000000..595add0a
--- /dev/null
+++ b/paddlespeech/s2t/models/hubert/modules/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/s2t/models/hubert/modules/hubert_model.py b/paddlespeech/s2t/models/hubert/modules/hubert_model.py
new file mode 100644
index 00000000..46f4d9bc
--- /dev/null
+++ b/paddlespeech/s2t/models/hubert/modules/hubert_model.py
@@ -0,0 +1,586 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Paddle Hubert model."""
+from dataclasses import dataclass
+from dataclasses import field
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+
+from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import ChoiceEnum
+from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import compute_mask_indices
+from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import ConvFeatureExtractionModel
+from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import EXTRACTOR_MODE_CHOICES
+from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import get_available_activation_fns
+from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import GLU
+from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import GradMultiply
+from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import LAYER_TYPE_CHOICES
+from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import MASKING_DISTRIBUTION_CHOICES
+from paddlespeech.s2t.models.wav2vec2.modules.wav2vec2_model import TransformerEncoder
+from paddlespeech.s2t.modules.align import LayerNorm
+from paddlespeech.s2t.modules.align import Linear
+from paddlespeech.s2t.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+
+@dataclass
+class HubertPretrainingConfig:
+ label_rate: float = field(
+ default=-1.0,
+ metadata={"help": "label frame rate. -1.0 for sequence label"}, )
+ sample_rate: int = field(
+ default=16_000,
+ metadata={
+ "help":
+ "target sample rate. audio files will be up/down "
+ "sampled to this rate"
+ }, )
+ normalize: bool = field(
+ default=False,
+ metadata={
+ "help": "if set, normalizes input to have 0 mean and unit variance"
+ }, )
+ enable_padding: bool = field(
+ default=False,
+ metadata={"help": "pad shorter samples instead of cropping"}, )
+ max_keep_size: Optional[int] = field(
+ default=None,
+ metadata={"help": "exclude sample longer than this"}, )
+ max_sample_size: Optional[int] = field(
+ default=None,
+ metadata={"help": "max sample size to crop to for batching"}, )
+ min_sample_size: Optional[int] = field(
+ default=None,
+ metadata={"help": "min sample size to crop to for batching"}, )
+ random_crop: Optional[bool] = field(
+ default=True,
+ metadata={"help": "always crop from the beginning if false"}, )
+ pad_audio: Optional[bool] = field(
+ default=False,
+ metadata={"help": "pad audio to the longest one in the batch if true"},
+ )
+
+
+@dataclass
+class HubertConfig:
+ label_rate: float
+
+ extractor_mode: EXTRACTOR_MODE_CHOICES = field(
+ default="default",
+ metadata={
+ "help":
+ "mode for feature extractor. default has a single group "
+ "norm with d groups in the first conv block, whereas layer_norm "
+ "has layer norms in every block (meant to use with normalize=True)"
+ }, )
+ encoder_layers: int = field(
+ default=12, metadata={"help": "num encoder layers in the transformer"})
+ encoder_embed_dim: int = field(
+ default=768, metadata={"help": "encoder embedding dimension"})
+ encoder_ffn_embed_dim: int = field(
+ default=3072, metadata={"help": "encoder embedding dimension for FFN"})
+ encoder_attention_heads: int = field(
+ default=12, metadata={"help": "num encoder attention heads"})
+ activation_fn: ChoiceEnum(get_available_activation_fns()) = field(
+ default="gelu", metadata={"help": "activation function to use"})
+ layer_type: LAYER_TYPE_CHOICES = field(
+ default="transformer", metadata={"help": "layer type in encoder"})
+
+ # dropouts
+ dropout: float = field(
+ default=0.1,
+ metadata={"help": "dropout probability for the transformer"}, )
+ attention_dropout: float = field(
+ default=0.1,
+ metadata={"help": "dropout probability for attention weights"}, )
+ activation_dropout: float = field(
+ default=0.0,
+ metadata={"help": "dropout probability after activation in FFN"}, )
+ encoder_layerdrop: float = field(
+ default=0.0,
+ metadata={"help": "probability of dropping a tarnsformer layer"}, )
+ dropout_input: float = field(
+ default=0.0,
+ metadata={"help": "dropout to apply to the input (after feat extr)"}, )
+ dropout_features: float = field(
+ default=0.0,
+ metadata={"help": "dropout to apply to the features (after feat extr)"},
+ )
+
+ final_dim: int = field(
+ default=0,
+ metadata={
+ "help":
+ "project final representations and targets to this many "
+ "dimensions. set to encoder_embed_dim is <= 0"
+ }, )
+ untie_final_proj: bool = field(
+ default=False,
+ metadata={"help": "use separate projection for each target"}, )
+ layer_norm_first: bool = field(
+ default=False,
+ metadata={"help": "apply layernorm first in the transformer"}, )
+ conv_feature_layers: str = field(
+ default="[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
+ metadata={
+ "help":
+ "string describing convolutional feature extraction "
+ "layers in form of a python list that contains "
+ "[(dim, kernel_size, stride), ...]"
+ }, )
+ conv_bias: bool = field(
+ default=False, metadata={"help": "include bias in conv encoder"})
+ logit_temp: float = field(
+ default=0.1, metadata={"help": "temperature to divide logits by"})
+ target_glu: bool = field(
+ default=False, metadata={"help": "adds projection + glu to targets"})
+ feature_grad_mult: float = field(
+ default=1.0,
+ metadata={"help": "multiply feature extractor var grads by this"}, )
+
+ # masking
+ mask_length: int = field(default=10, metadata={"help": "mask length"})
+ mask_prob: float = field(
+ default=0.65,
+ metadata={"help": "probability of replacing a token with mask"}, )
+ mask_selection: MASKING_DISTRIBUTION_CHOICES = field(
+ default="static", metadata={"help": "how to choose mask length"})
+ mask_other: float = field(
+ default=0,
+ metadata={
+ "help":
+ "secondary mask argument "
+ "(used for more complex distributions), "
+ "see help in compute_mask_indicesh"
+ }, )
+ no_mask_overlap: bool = field(
+ default=False, metadata={"help": "whether to allow masks to overlap"})
+ mask_min_space: int = field(
+ default=1,
+ metadata={"help": "min space between spans (if no overlap is enabled)"},
+ )
+
+ # channel masking
+ mask_channel_length: int = field(
+ default=10,
+ metadata={"help": "length of the mask for features (channels)"}, )
+ mask_channel_prob: float = field(
+ default=0.0,
+ metadata={"help": "probability of replacing a feature with 0"}, )
+ mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field(
+ default="static",
+ metadata={"help": "how to choose mask length for channel masking"}, )
+ mask_channel_other: float = field(
+ default=0,
+ metadata={
+ "help":
+ "secondary mask argument "
+ "(used for more complex distributions), "
+ "see help in compute_mask_indicesh"
+ }, )
+ no_mask_channel_overlap: bool = field(
+ default=False,
+ metadata={"help": "whether to allow channel masks to overlap"}, )
+ mask_channel_min_space: int = field(
+ default=1,
+ metadata={"help": "min space between spans (if no overlap is enabled)"},
+ )
+
+ # positional embeddings
+ conv_pos: int = field(
+ default=128,
+ metadata={
+ "help": "number of filters for convolutional positional embeddings"
+ }, )
+ conv_pos_groups: int = field(
+ default=16,
+ metadata={
+ "help": "number of groups for convolutional positional embedding"
+ }, )
+
+ latent_temp: Tuple[float, float, float] = field(
+ default=(2, 0.5, 0.999995),
+ metadata={"help": "legacy (to be removed)"}, )
+
+ # loss computation
+ skip_masked: bool = field(
+ default=False,
+ metadata={"help": "skip computing losses over masked frames"}, )
+ skip_nomask: bool = field(
+ default=False,
+ metadata={"help": "skip computing losses over unmasked frames"}, )
+
+ checkpoint_activations: bool = field(
+ default=False,
+ metadata={
+ "help": "recompute activations and save memory for extra compute"
+ }, )
+
+ # FP16 optimization
+ required_seq_len_multiple: int = field(
+ default=2,
+ metadata={
+ "help":
+ "pad the input to encoder such that the sequence length is divisible by multiple"
+ }, )
+
+ # Conformer
+ depthwise_conv_kernel_size: int = field(
+ default=31,
+ metadata={
+ "help":
+ "depthwise-conv-kernel-size for convolution in conformer layer"
+ }, )
+ attn_type: str = field(
+ default="",
+ metadata={"help": "if espnet use ESPNET MHA"}, )
+ pos_enc_type: str = field(
+ default="abs",
+ metadata={"help": "Positional encoding type to use in conformer"}, )
+ fp16: bool = field(
+ default=False, metadata={"help": "If fp16 is being used"})
+
+
+class HubertModel(nn.Layer):
+ def __init__(
+ self,
+ cfg: HubertConfig,
+ task_cfg: HubertPretrainingConfig,
+ dictionaries: List[Any], ) -> None:
+ super().__init__()
+ logger.info(f"HubertModel Config: {cfg}")
+
+ feature_enc_layers = eval(cfg.conv_feature_layers) # noqa
+ self.embed = feature_enc_layers[-1][0]
+
+ self.feature_extractor = ConvFeatureExtractionModel(
+ conv_layers=feature_enc_layers,
+ dropout=0.0,
+ mode=cfg.extractor_mode,
+ conv_bias=cfg.conv_bias, )
+ feature_ds_rate = np.prod([s for _, _, s in feature_enc_layers])
+ self.feat2tar_ratio = cfg.label_rate * feature_ds_rate / task_cfg.sample_rate
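+        # For the default conv stack "[(512,10,5)] + [(512,3,2)]*4 + [(512,2,2)]*2"
+        # the strides multiply to 5 * 2**6 = 320, so with 16 kHz audio and a
+        # 50 Hz label rate (an assumed, typical value) feat2tar_ratio is
+        # 50 * 320 / 16000 = 1.0, i.e. one label per extracted feature frame.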
+
+ self.post_extract_proj = (Linear(self.embed, cfg.encoder_embed_dim) if
+ self.embed != cfg.encoder_embed_dim else None)
+
+ self.mask_prob = cfg.mask_prob
+ self.mask_selection = cfg.mask_selection
+ self.mask_other = cfg.mask_other
+ self.mask_length = cfg.mask_length
+ self.no_mask_overlap = cfg.no_mask_overlap
+ self.mask_min_space = cfg.mask_min_space
+
+ self.mask_channel_prob = cfg.mask_channel_prob
+ self.mask_channel_selection = cfg.mask_channel_selection
+ self.mask_channel_other = cfg.mask_channel_other
+ self.mask_channel_length = cfg.mask_channel_length
+ self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
+ self.mask_channel_min_space = cfg.mask_channel_min_space
+
+ self.dropout_input = nn.Dropout(cfg.dropout_input)
+ self.dropout_features = nn.Dropout(cfg.dropout_features)
+
+ self.feature_grad_mult = cfg.feature_grad_mult
+ self.logit_temp = cfg.logit_temp
+ self.skip_masked = cfg.skip_masked
+ self.skip_nomask = cfg.skip_nomask
+
+ final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim
+
+ self.mask_emb = paddle.create_parameter(
+ shape=[cfg.encoder_embed_dim],
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Uniform(low=0), )
+
+ self.encoder = TransformerEncoder(cfg)
+ self.layer_norm = LayerNorm(self.embed)
+
+ self.target_glu = None
+ if cfg.target_glu:
+ self.target_glu = nn.Sequential(
+ Linear(final_dim, final_dim * 2), GLU())
+
+ self.untie_final_proj = cfg.untie_final_proj
+ if self.untie_final_proj:
+ self.final_proj = Linear(cfg.encoder_embed_dim,
+ final_dim * len(dictionaries))
+ else:
+ self.final_proj = Linear(cfg.encoder_embed_dim, final_dim)
+
+ # modules below are not needed during fine-tuning
+ if any([d is None for d in dictionaries]):
+ logger.info(
+ "cannot find dictionary. assume will be used for fine-tuning")
+ else:
+ self.num_classes = [len(d) for d in dictionaries]
+ self.label_embs_concat = paddle.create_parameter(
+ shape=[sum(self.num_classes), final_dim],
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Uniform(low=0), )
+
+ @classmethod
+ def build_model(cls, cfg: HubertConfig, task):
+ """Build a new model instance."""
+
+ model = HubertModel(cfg, task.cfg, task.dictionaries)
+ return model
+
+ def apply_mask(self, x, padding_mask, target_list):
+ B, T, C = x.shape
+ if self.mask_prob > 0:
+ mask_indices = compute_mask_indices(
+ (B, T),
+ padding_mask,
+ self.mask_prob,
+ self.mask_length,
+ self.mask_selection,
+ self.mask_other,
+ min_masks=2,
+ no_overlap=self.no_mask_overlap,
+ min_space=self.mask_min_space, )
+
+ mask_indices = paddle.to_tensor(
+ mask_indices, dtype='int64', place=x.place)
+ x[mask_indices] = self.mask_emb
+ else:
+ mask_indices = None
+
+ if self.mask_channel_prob > 0:
+ mask_channel_indices = compute_mask_indices(
+ (B, C),
+ None,
+ self.mask_channel_prob,
+ self.mask_channel_length,
+ self.mask_channel_selection,
+ self.mask_channel_other,
+ no_overlap=self.no_mask_channel_overlap,
+ min_space=self.mask_channel_min_space, )
+ mask_channel_indices = (paddle.to_tensor(
+ mask_channel_indices, dtype='int64', place=x.place).unsqueeze(1)
+ .expand([-1, T, -1]))
+ x[mask_channel_indices] = 0
+
+ return x, mask_indices
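+
+    # Editorial summary (no new behavior): apply_mask replaces the selected time
+    # steps with the learned mask embedding and can additionally zero out whole
+    # feature channels; the returned mask_indices are reused in forward() to
+    # split logits into masked and unmasked groups.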
+
+ def compute_nce(self, x, pos, negs):
+ neg_is_pos = (pos == negs).all(-1)
+ pos = pos.unsqueeze(0)
+ targets = paddle.concat([pos, negs], axis=0)
+
+ logits = paddle.nn.functional.cosine_similarity(
+ x.astype('float32'), targets.astype('float32'), axis=-1)
+ logits /= self.logit_temp
+ if paddle.any(neg_is_pos):
+ logits[1:][neg_is_pos] = float("-inf")
+ logits = logits.transpose([1, 0]) # (num_x, num_cls+1)
+ return logits
+
+ def forward_features(self, source: paddle.Tensor) -> paddle.Tensor:
+ if self.feature_grad_mult > 0:
+ features = self.feature_extractor(source)
+ if self.feature_grad_mult != 1.0:
+ features = GradMultiply.apply(features, self.feature_grad_mult)
+ else:
+ with paddle.no_grad():
+ features = self.feature_extractor(source)
+ return features
+
+ def forward_targets(
+ self,
+ features: paddle.Tensor,
+ target_list: List[paddle.Tensor],
+ ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ # Trim features to ensure labels exist and then get aligned labels
+ feat_tsz = features.shape[2]
+ targ_tsz = min([t.shape[1] for t in target_list])
+ if self.feat2tar_ratio * feat_tsz > targ_tsz:
+ feat_tsz = int(targ_tsz / self.feat2tar_ratio)
+ features = features[:, :, :feat_tsz]
+ target_inds = paddle.arange(feat_tsz).astype(
+ 'float32') * self.feat2tar_ratio
+ target_list = [t[:, target_inds.astype('int64')] for t in target_list]
+ return features, target_list
+
+ def forward_padding_mask(
+ self,
+ features: paddle.Tensor,
+ padding_mask: paddle.Tensor, ) -> paddle.Tensor:
+ extra = padding_mask.shape[1] % features.shape[1]
+ if extra > 0:
+ padding_mask = padding_mask[:, :-extra]
+ padding_mask = paddle.reshape(
+ padding_mask, [padding_mask.shape[0], features.shape[1], -1])
+ padding_mask = paddle.all(padding_mask, axis=-1)
+ return padding_mask
+
+ def forward(
+ self,
+ source: paddle.Tensor,
+ target_list: Optional[List[paddle.Tensor]]=None,
+ padding_mask: Optional[paddle.Tensor]=None,
+ mask: bool=True,
+ features_only: bool=False,
+ output_layer: Optional[int]=None, ) -> Dict[str, paddle.Tensor]:
+ """output layer is 1-based"""
+ features = self.forward_features(source)
+ if target_list is not None:
+ features, target_list = self.forward_targets(features, target_list)
+
+ features_pen = features.pow(2).mean()
+
+ features = features.transpose([0, 2, 1])
+ features = self.layer_norm(features)
+ unmasked_features = features.clone()
+
+ if padding_mask is not None:
+ padding_mask = self.forward_padding_mask(features, padding_mask)
+
+ if self.post_extract_proj is not None:
+ features = self.post_extract_proj(features)
+
+ features = self.dropout_input(features)
+ unmasked_features = self.dropout_features(unmasked_features)
+
+ if mask:
+ x, mask_indices = self.apply_mask(features, padding_mask,
+ target_list)
+ else:
+ x = features
+ mask_indices = None
+
+ # feature: (B, T, D), float
+ # target: (B, T), long
+ # x: (B, T, D), float
+ # padding_mask: (B, T), bool
+ # mask_indices: (B, T), bool
+ x, _ = self.encoder(
+ x,
+ padding_mask=padding_mask,
+ layer=None if output_layer is None else output_layer - 1, )
+
+ if features_only:
+ return {"x": x, "padding_mask": padding_mask, "features": features}
+
+        def compute_pred(proj_x, target, label_embs):
+            # compute logits for the i-th label set
+            y = paddle.index_select(
+                label_embs, index=target.astype('int64'), axis=0)
+            negs = paddle.expand(
+                label_embs.unsqueeze(1),
+                [label_embs.shape[0], proj_x.shape[0], label_embs.shape[-1]])
+            if self.target_glu:
+                y = self.target_glu(y)
+                negs = self.target_glu(negs)
+            # proj_x: (S, D)
+            # y: (S, D)
+            # negs: (Neg, S, D)
+            return self.compute_nce(proj_x, y, negs)
+
+ label_embs_list = self.label_embs_concat.split(self.num_classes, 0)
+
+ if not self.skip_masked:
+ masked_indices = paddle.logical_and(~padding_mask, mask_indices)
+ proj_x_m = self.final_proj(x[masked_indices])
+ if self.untie_final_proj:
+ proj_x_m_list = proj_x_m.chunk(len(target_list), dim=-1)
+ else:
+ proj_x_m_list = [proj_x_m for _ in range(len(target_list))]
+ logit_m_list = [
+ compute_pred(proj_x_m, t[masked_indices], label_embs_list[i])
+ for i, (proj_x_m, t
+ ) in enumerate(zip(proj_x_m_list, target_list))
+ ]
+ else:
+ logit_m_list = [None for _ in target_list]
+
+ if not self.skip_nomask:
+ nomask_indices = paddle.logical_and(~padding_mask, ~mask_indices)
+ proj_x_u = self.final_proj(x[nomask_indices])
+ if self.untie_final_proj:
+ proj_x_u_list = proj_x_u.chunk(len(target_list), dim=-1)
+ else:
+ proj_x_u_list = [proj_x_u for _ in range(len(target_list))]
+
+ logit_u_list = [
+ compute_pred(proj_x_u, t[nomask_indices], label_embs_list[i])
+ for i, (proj_x_u, t
+ ) in enumerate(zip(proj_x_u_list, target_list))
+ ]
+ else:
+ logit_u_list = [None for _ in target_list]
+
+ result = {
+ "logit_m_list": logit_m_list,
+ "logit_u_list": logit_u_list,
+ "padding_mask": padding_mask,
+ "features_pen": features_pen,
+ }
+ return result
+
+ def extract_features(
+ self,
+ source: paddle.Tensor,
+ padding_mask: Optional[paddle.Tensor]=None,
+ mask: bool=False,
+ ret_conv: bool=False,
+ output_layer: Optional[int]=None,
+ ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ res = self.forward(
+ source,
+ padding_mask=padding_mask,
+ mask=mask,
+ features_only=True,
+ output_layer=output_layer, )
+ feature = res["features"] if ret_conv else res["x"]
+ return feature, res["padding_mask"]
+
+ def get_logits(self, net_output, is_masked=True):
+ if is_masked:
+ logits_list = net_output["logit_m_list"]
+ else:
+ logits_list = net_output["logit_u_list"]
+ logits_list = [
+ paddle.cast(x, 'float32') for x in logits_list if x is not None
+ ]
+ return logits_list
+
+ def get_targets(self, net_output, is_masked=True):
+ logits_list = self.get_logits(net_output, is_masked)
+ targets_list = [
+ paddle.zeros_like(x, dtype='int64') for x in logits_list
+ ]
+ return targets_list
+
+ def get_extra_losses(self, net_output):
+ extra_losses = []
+ names = []
+
+ if "features_pen" in net_output:
+ extra_losses.append(net_output["features_pen"])
+ names.append("features_pen")
+
+ return extra_losses, names
+
+ def remove_pretraining_modules(self):
+ self.target_glu = None
+ self.final_proj = None
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 544c1e83..2e1c14ac 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -43,6 +43,7 @@ from paddlespeech.s2t.modules.ctc import CTCDecoderBase
from paddlespeech.s2t.modules.decoder import BiTransformerDecoder
from paddlespeech.s2t.modules.decoder import TransformerDecoder
from paddlespeech.s2t.modules.encoder import ConformerEncoder
+from paddlespeech.s2t.modules.encoder import SqueezeformerEncoder
from paddlespeech.s2t.modules.encoder import TransformerEncoder
from paddlespeech.s2t.modules.initializer import DefaultInitializerContext
from paddlespeech.s2t.modules.loss import LabelSmoothingLoss
@@ -144,7 +145,6 @@ class U2BaseModel(ASRInterface, nn.Layer):
text_lengths)
ctc_time = time.time() - start
#logger.debug(f"ctc time: {ctc_time}")
-
if loss_ctc is None:
loss = loss_att
elif loss_att is None:
@@ -559,7 +559,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
[len(hyp[0]) for hyp in hyps], place=device,
dtype=paddle.long) # (beam_size,)
hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id)
- hyps_lens = hyps_lens + 1 # Add at begining
+ hyps_lens = hyps_lens + 1 # Add at beginning
logger.debug(
f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}")
@@ -708,7 +708,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
hypothesis from ctc prefix beam search and one encoder output
Args:
hyps (paddle.Tensor): hyps from ctc prefix beam search, already
- pad sos at the begining, (B, T)
+ pad sos at the beginning, (B, T)
hyps_lens (paddle.Tensor): length of each hyp in hyps, (B)
encoder_out (paddle.Tensor): corresponding encoder output, (B=1, T, D)
Returns:
@@ -905,6 +905,9 @@ class U2Model(U2DecodeModel):
elif encoder_type == 'conformer':
encoder = ConformerEncoder(
input_dim, global_cmvn=global_cmvn, **configs['encoder_conf'])
+ elif encoder_type == 'squeezeformer':
+ encoder = SqueezeformerEncoder(
+ input_dim, global_cmvn=global_cmvn, **configs['encoder_conf'])
else:
raise ValueError(f"not support encoder type:{encoder_type}")
@@ -912,6 +915,8 @@ class U2Model(U2DecodeModel):
decoder_type = configs.get('decoder', 'transformer')
logger.debug(f"U2 Decoder type: {decoder_type}")
if decoder_type == 'transformer':
+ configs['model_conf'].pop('reverse_weight', None)
+ configs['decoder_conf'].pop('r_num_blocks', None)
decoder = TransformerDecoder(vocab_size,
encoder.output_size(),
**configs['decoder_conf'])
diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py
index 31defbba..b4c8c255 100644
--- a/paddlespeech/s2t/models/u2_st/u2_st.py
+++ b/paddlespeech/s2t/models/u2_st/u2_st.py
@@ -455,7 +455,7 @@ class U2STBaseModel(nn.Layer):
hypothesis from ctc prefix beam search and one encoder output
Args:
hyps (paddle.Tensor): hyps from ctc prefix beam search, already
- pad sos at the begining, (B, T)
+ pad sos at the beginning, (B, T)
hyps_lens (paddle.Tensor): length of each hyp in hyps, (B)
encoder_out (paddle.Tensor): corresponding encoder output, (B=1, T, D)
Returns:
diff --git a/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py b/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py
new file mode 100644
index 00000000..3fbb9426
--- /dev/null
+++ b/paddlespeech/s2t/models/wav2vec2/modules/wav2vec2_model.py
@@ -0,0 +1,2614 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Paddle Wav2Vec2 model."""
+import math
+import uuid
+from dataclasses import dataclass
+from dataclasses import field
+from enum import Enum
+from enum import EnumMeta
+from typing import Callable
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import Tensor
+
+from paddlespeech.s2t.modules.align import Conv1D
+from paddlespeech.s2t.modules.align import Conv2D
+from paddlespeech.s2t.modules.align import Embedding
+from paddlespeech.s2t.modules.align import LayerNorm
+from paddlespeech.s2t.modules.align import Linear
+from paddlespeech.s2t.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+
+class GLU(nn.Layer):
+ r"""Applies the gated linear unit function
+ :math:`{GLU}(a, b)= a \otimes \sigma(b)` where :math:`a` is the first half
+ of the input matrices and :math:`b` is the second half.
+
+ Args:
+ axis (int): the dimension on which to split the input. Default: -1
+
+ Shape:
+ - Input: :math:`(\ast_1, N, \ast_2)` where `*` means, any number of additional
+ dimensions
+ - Output: :math:`(\ast_1, M, \ast_2)` where :math:`M=N/2`
+
+ Examples::
+
+        >>> m = GLU()
+ >>> input = paddle.randn([4, 2])
+ >>> output = m(input)
+ """
+
+ def __init__(self, axis: int=-1) -> None:
+ super().__init__()
+ self.axis = axis
+
+ def forward(self, input: Tensor) -> Tensor:
+ return F.glu(input, self.axis)
+
+
+class FairseqIncrementalState(object):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.init_incremental_state()
+
+ def init_incremental_state(self):
+ self._incremental_state_id = str(uuid.uuid4())
+
+ def _get_full_incremental_state_key(self, key: str) -> str:
+ return "{}.{}".format(self._incremental_state_id, key)
+
+ def get_incremental_state(
+ self,
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+ key: str, ) -> Optional[Dict[str, Optional[Tensor]]]:
+ """Helper for getting incremental state for an nn.Layer."""
+ full_key = self._get_full_incremental_state_key(key)
+ if incremental_state is None or full_key not in incremental_state:
+ return None
+ return incremental_state[full_key]
+
+ def set_incremental_state(
+ self,
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]],
+ key: str,
+ value: Dict[str, Optional[Tensor]],
+ ) -> Optional[Dict[str, Dict[str, Optional[Tensor]]]]:
+ """Helper for setting incremental state for an nn.Layer."""
+ if incremental_state is not None:
+ full_key = self._get_full_incremental_state_key(key)
+ incremental_state[full_key] = value
+ return incremental_state
+
+
+def with_incremental_state(cls):
+ cls.__bases__ = (FairseqIncrementalState, ) + tuple(
+ b for b in cls.__bases__ if b != FairseqIncrementalState)
+ return cls
+
+
+class FairseqDropout(paddle.nn.Layer):
+ def __init__(self, p, module_name=None):
+ super().__init__()
+ self.p = p
+ self.module_name = module_name
+ self.apply_during_inference = False
+
+ def forward(self, x):
+ if self.p > 0 and (self.training or self.apply_during_inference):
+ return F.dropout(x, p=self.p, training=True)
+ else:
+ return x
+
+ def make_generation_fast_(
+ self,
+ name: str,
+ retain_dropout: bool=False,
+ retain_dropout_modules: Optional[List[str]]=None,
+ **kwargs, ):
+ if retain_dropout:
+ if retain_dropout_modules is not None and self.module_name is None:
+ logger.warning(
+ "Cannot enable dropout during inference for module {} "
+ "because module_name was not set".format(name))
+            elif (retain_dropout_modules is None  # if None, apply to all modules
+                  or self.module_name in retain_dropout_modules):
+ logger.info("Enabling dropout during inference for module: {}".
+ format(name))
+ self.apply_during_inference = True
+ else:
+ logger.info("Disabling dropout for module: {}".format(name))
+
+
+def quant_noise(module, p, block_size):
+ """
+ Wraps modules and applies quantization noise to the weights for
+ subsequent quantization with Iterative Product Quantization as
+ described in "Training with Quantization Noise for Extreme Model Compression"
+
+ Args:
+ - module: nn.Layer
+ - p: amount of Quantization Noise
+ - block_size: size of the blocks for subsequent quantization with iPQ
+
+ Remarks:
+ - Layer weights must have the right sizes wrt the block size
+ - Only Linear, Embedding and Conv2d modules are supported for the moment
+ - For more detail on how to quantize by blocks with convolutional weights,
+ see "And the Bit Goes Down: Revisiting the Quantization of Neural Networks"
+ - We implement the simplest form of noise here as stated in the paper
+ which consists in randomly dropping blocks
+ """
+
+ # if no quantization noise, don't register hook
+ if p <= 0:
+ return module
+
+ # supported modules
+ assert isinstance(module, (Linear, Embedding, Conv2D))
+
+ # test whether module.weight has the right sizes wrt block_size
+ is_conv = len(module.weight.shape) == 4
+
+ # 2D matrix
+ if not is_conv:
+ if isinstance(module, Linear):
+ features_weight = module.weight.shape[0]
+ else:
+ features_weight = module.weight.shape[1]
+ assert (
+ features_weight %
+ block_size == 0), "Input features must be a multiple of block sizes"
+
+ # 4D matrix
+ else:
+ # 1x1 convolutions
+ if module.weight.shape[2:] == (1, 1):
+ assert (module.weight.shape[1] % block_size == 0
+ ), "Input channels must be a multiple of block sizes"
+ # regular convolutions
+ else:
+ k = module.weight.shape[2] * module.weight.shape[3]
+ assert k % block_size == 0, "Kernel size must be a multiple of block size"
+
+ def _forward_pre_hook(mod, input):
+ # no noise for evaluation
+ if mod.training:
+ if not is_conv:
+ # gather weight and sizes
+ weight = mod.weight
+ if isinstance(module, Linear):
+ in_features = weight.shape[0]
+ out_features = weight.shape[1]
+ else:
+ in_features = weight.shape[1]
+ out_features = weight.shape[0]
+
+ # split weight matrix into blocks and randomly drop selected blocks
+ mask = paddle.zeros(
+ [in_features // block_size * out_features],
+ dtype=paddle.bool)
+ # the implementation of bernoulli_, p=0.5
+ mask = paddle.ones_like(mask) * 0.5
+ mask = paddle.bernoulli(mask)
+ mask = mask.unsqueeze(1).tile([1, block_size]).reshape(
+ [-1, in_features])
+
+ else:
+ # gather weight and sizes
+ weight = mod.weight
+ in_channels = mod.weight.shape[1]
+ out_channels = mod.weight.shape[0]
+
+ # split weight matrix into blocks and randomly drop selected blocks
+ if module.weight.shape[2:] == (1, 1):
+ mask = paddle.zeros(
+ [in_channels // block_size * out_channels],
+ dtype=paddle.bool)
+
+ # the implementation of bernoulli_, p=0.5
+ mask = paddle.ones_like(mask) * 0.5
+ mask = paddle.bernoulli(mask)
+ mask = mask.unsqueeze(1).tile([1, block_size]).reshape(
+ [-1, in_channels])
+ else:
+ mask = paddle.zeros(weight.shape)
+
+ # the implementation of bernoulli_, p=0.5
+ mask = paddle.ones_like(mask) * 0.5
+ mask = paddle.bernoulli(mask)
+ mask = mask.unsqueeze(1).tile([1, in_channels, 1, 1])
+
+ # scale weights and apply mask
+ s = 1 / (1 - p)
+ mod.weight.set_value(s * weight.masked_fill(mask, 0))
+
+ module.register_forward_pre_hook(_forward_pre_hook)
+ return module
+
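+# Hedged example (editorial illustration, not upstream code): wrapping a
+# projection with 10% quantization noise over blocks of 8 weights:
+#     proj = quant_noise(Linear(768, 768), p=0.1, block_size=8)
+# During training the forward pre-hook randomly zeroes weight blocks and
+# rescales the remaining weights by 1 / (1 - p); with p <= 0 the module is
+# returned unchanged.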
+
+@with_incremental_state
+class MultiheadAttention(nn.Layer):
+ """Multi-headed attention.
+
+ See "Attention Is All You Need" for more details.
+ """
+
+ def __init__(
+ self,
+ embed_dim,
+ num_heads,
+ kdim=None,
+ vdim=None,
+ dropout=0.0,
+ bias=True,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ self_attention=False,
+ encoder_decoder_attention=False,
+ q_noise=0.0,
+ qn_block_size=8,
+ # TODO: pass in config rather than string.
+ # config defined in xformers.components.attention.AttentionConfig
+ xformers_att_config: Optional[str]=None,
+ xformers_blocksparse_layout: Optional[
+ paddle.Tensor]=None, # This should be part of the config
+ xformers_blocksparse_blocksize: Optional[
+ int]=16, # This should be part of the config
+ ):
+ super().__init__()
+
+ def eval_str_dict(x, type=dict):
+ if x is None:
+ return None
+ if isinstance(x, str):
+ x = eval(x)
+ return x
+
+ xformers_att_config = eval_str_dict(xformers_att_config)
+ self.use_xformers = xformers_att_config is not None
+ assert not self.use_xformers, "Do not use xformers in PaddleSpeech"
+
+ self.embed_dim = embed_dim
+ self.kdim = kdim if kdim is not None else embed_dim
+ self.vdim = vdim if vdim is not None else embed_dim
+ self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+ self.num_heads = num_heads
+ self.dropout_module = FairseqDropout(
+ dropout, module_name=self.__class__.__name__)
+
+ self.head_dim = embed_dim // num_heads
+ assert (self.head_dim * num_heads == self.embed_dim
+ ), "embed_dim must be divisible by num_heads"
+ self.scaling = self.head_dim**-0.5
+
+ self.self_attention = self_attention
+ self.encoder_decoder_attention = encoder_decoder_attention
+
+ assert not self.self_attention or self.qkv_same_dim, (
+ "Self-attention requires query, key and "
+ "value to be of the same size")
+
+ # Todo scaled initialization
+ # Empirically observed the convergence to be much better with
+ # the scaled initialization
+ weight_attr = nn.initializer.XavierUniform()
+ kv_proj_bias_attr = nn.initializer.XavierUniform()
+ out_proj_bias_attr = nn.initializer.Constant(0)
+
+ self.k_proj = quant_noise(
+ nn.Linear(
+ self.kdim,
+ embed_dim,
+ weight_attr=weight_attr,
+ bias_attr=bias
+ if not bias else kv_proj_bias_attr), q_noise, qn_block_size)
+ self.v_proj = quant_noise(
+ nn.Linear(
+ self.vdim,
+ embed_dim,
+ weight_attr=weight_attr,
+ bias_attr=bias
+ if not bias else kv_proj_bias_attr), q_noise, qn_block_size)
+ self.q_proj = quant_noise(
+ nn.Linear(
+ embed_dim, embed_dim, weight_attr=weight_attr, bias_attr=bias),
+ q_noise, qn_block_size)
+
+ self.out_proj = quant_noise(
+ nn.Linear(
+ embed_dim,
+ embed_dim,
+ weight_attr=weight_attr,
+ bias_attr=bias
+ if not bias else out_proj_bias_attr), q_noise, qn_block_size)
+
+ # nn.initializer.XavierUniform(self.k_proj.weight, gain=1 / math.sqrt(2))
+ # nn.initializer.XavierUniform(self.v_proj.weight, gain=1 / math.sqrt(2))
+ # nn.initializer.XavierUniform(self.q_proj.weight, gain=1 / math.sqrt(2))
+ # else:
+ # self.k_proj.weight = paddle.ParamAttr()
+ # nn.initializer.XavierUniform(self.k_proj.weight)
+ # nn.initializer.XavierUniform(self.v_proj.weight)
+ # nn.initializer.XavierUniform(self.q_proj.weight)
+
+ # nn.initializer.XavierUniform(self.out_proj.weight)
+ # if self.out_proj.bias is not None:
+ # nn.initializer.Constant(self.out_proj.bias)
+ # if self.bias_k is not None:
+ # nn.initializer.XavierNormal(self.bias_k)
+ # if self.bias_v is not None:
+ # nn.initializer.XavierNormal(self.bias_v)
+
+ # self.k_proj = Linear(self.kdim, embed_dim)
+
+ # self.v_proj = Linear(self.vdim, embed_dim)
+
+ # self.q_proj = Linear(embed_dim, embed_dim)
+
+ # self.out_proj = Linear(embed_dim, embed_dim)
+
+ if add_bias_kv:
+            self.bias_k = paddle.create_parameter(
+                shape=[1, 1, embed_dim],
+                dtype='float32',
+                default_initializer=nn.initializer.XavierUniform())
+            self.bias_v = paddle.create_parameter(
+                shape=[1, 1, embed_dim],
+                dtype='float32',
+                default_initializer=nn.initializer.XavierUniform())
+ else:
+ self.bias_k = self.bias_v = None
+
+ self.add_zero_attn = add_zero_attn
+ self.beam_size = 1
+ # self.reset_parameters()
+
+ self.onnx_trace = False
+ self.skip_embed_dim_check = False
+
+ def prepare_for_onnx_export_(self):
+ self.onnx_trace = True
+
+ def reset_parameters(self):
+ if self.qkv_same_dim:
+ # Empirically observed the convergence to be much better with
+ # the scaled initialization
+ nn.initializer.XavierUniform(
+ self.k_proj.weight, gain=1 / math.sqrt(2))
+ nn.initializer.XavierUniform(
+ self.v_proj.weight, gain=1 / math.sqrt(2))
+ nn.initializer.XavierUniform(
+ self.q_proj.weight, gain=1 / math.sqrt(2))
+ else:
+ self.k_proj.weight = paddle.ParamAttr()
+ nn.initializer.XavierUniform(self.k_proj.weight)
+ nn.initializer.XavierUniform(self.v_proj.weight)
+ nn.initializer.XavierUniform(self.q_proj.weight)
+
+ nn.initializer.XavierUniform(self.out_proj.weight)
+ if self.out_proj.bias is not None:
+ nn.initializer.Constant(self.out_proj.bias)
+ if self.bias_k is not None:
+ nn.initializer.XavierNormal(self.bias_k)
+ if self.bias_v is not None:
+ nn.initializer.XavierNormal(self.bias_v)
+
+ def _get_reserve_head_index(self, num_heads_to_keep: int):
+ k_proj_heads_norm = []
+ q_proj_heads_norm = []
+ v_proj_heads_norm = []
+
+ for i in range(self.num_heads):
+ start_idx = i * self.head_dim
+ end_idx = (i + 1) * self.head_dim
+ k_proj_heads_norm.append(
+ paddle.sum(
+ paddle.abs(self.k_proj.weight[:, start_idx:end_idx]))
+ .tolist() + paddle.sum(
+ paddle.abs(self.k_proj.bias[start_idx:end_idx])).tolist())
+ q_proj_heads_norm.append(
+ paddle.sum(
+ paddle.abs(self.q_proj.weight[:, start_idx:end_idx]))
+ .tolist() + paddle.sum(
+ paddle.abs(self.q_proj.bias[start_idx:end_idx])).tolist())
+ v_proj_heads_norm.append(
+ paddle.sum(
+ paddle.abs(self.v_proj.weight[:, start_idx:end_idx]))
+ .tolist() + paddle.sum(
+ paddle.abs(self.v_proj.bias[start_idx:end_idx])).tolist())
+
+ heads_norm = []
+ for i in range(self.num_heads):
+ heads_norm.append(k_proj_heads_norm[i] + q_proj_heads_norm[i] +
+ v_proj_heads_norm[i])
+
+ sorted_head_index = sorted(
+ range(self.num_heads), key=lambda k: heads_norm[k], reverse=True)
+ reserve_head_index = []
+ for i in range(num_heads_to_keep):
+ start = sorted_head_index[i] * self.head_dim
+ end = (sorted_head_index[i] + 1) * self.head_dim
+ reserve_head_index.append((start, end))
+
+ return reserve_head_index
+
+ def _adaptive_prune_heads(self, reserve_head_index: List[Tuple[int, int]]):
+ new_q_weight = []
+ new_q_bias = []
+ new_k_weight = []
+ new_k_bias = []
+ new_v_weight = []
+ new_v_bias = []
+ new_out_proj_weight = []
+
+ for ele in reserve_head_index:
+ start_idx, end_idx = ele
+ new_q_weight.append(self.q_proj.weight[:, start_idx:end_idx])
+ new_q_bias.append(self.q_proj.bias[start_idx:end_idx])
+
+ new_k_weight.append(self.k_proj.weight[:, start_idx:end_idx])
+
+ new_k_bias.append(self.k_proj.bias[start_idx:end_idx])
+
+ new_v_weight.append(self.v_proj.weight[:, start_idx:end_idx])
+ new_v_bias.append(self.v_proj.bias[start_idx:end_idx])
+
+ new_out_proj_weight.append(
+ self.out_proj.weight[start_idx:end_idx, ])
+
+ new_q_weight = paddle.concat(new_q_weight, axis=-1).detach()
+ new_k_weight = paddle.concat(new_k_weight, axis=-1).detach()
+ new_v_weight = paddle.concat(new_v_weight, axis=-1).detach()
+ new_out_proj_weight = paddle.concat(new_out_proj_weight).detach()
+ new_q_weight.stop_gradient = False
+ new_k_weight.stop_gradient = False
+ new_v_weight.stop_gradient = False
+ new_out_proj_weight.stop_gradient = False
+
+ new_q_bias = paddle.concat(new_q_bias).detach()
+ new_q_bias.stop_gradient = False
+
+ new_k_bias = paddle.concat(new_k_bias).detach()
+ new_k_bias.stop_gradient = False
+
+ new_v_bias = paddle.concat(new_v_bias).detach()
+ new_v_bias.stop_gradient = False
+
+ self.q_proj.weight = paddle.create_parameter(
+ shape=new_q_weight.shape,
+ dtype=new_q_weight.dtype,
+ default_initializer=paddle.nn.initializer.Assign(new_q_weight))
+ self.q_proj.bias = paddle.create_parameter(
+ shape=new_q_bias.shape,
+ dtype=new_q_bias.dtype,
+ default_initializer=paddle.nn.initializer.Assign(new_q_bias))
+
+ self.k_proj.weight = paddle.create_parameter(
+ shape=new_k_weight.shape,
+ dtype=new_k_weight.dtype,
+ default_initializer=paddle.nn.initializer.Assign(new_k_weight))
+ self.k_proj.bias = paddle.create_parameter(
+ shape=new_k_bias.shape,
+ dtype=new_k_bias.dtype,
+ default_initializer=paddle.nn.initializer.Assign(new_k_bias))
+
+ self.v_proj.weight = paddle.create_parameter(
+ shape=new_v_weight.shape,
+ dtype=new_v_weight.dtype,
+ default_initializer=paddle.nn.initializer.Assign(new_v_weight))
+ self.v_proj.bias = paddle.create_parameter(
+ shape=new_v_bias.shape,
+ dtype=new_v_bias.dtype,
+ default_initializer=paddle.nn.initializer.Assign(new_v_bias))
+
+ self.out_proj.weight = paddle.create_parameter(
+ shape=new_out_proj_weight.shape,
+ dtype=new_out_proj_weight.dtype,
+ default_initializer=paddle.nn.initializer.Assign(
+ new_out_proj_weight))
+
+ self.num_heads = len(reserve_head_index)
+ self.embed_dim = self.head_dim * self.num_heads
+ self.q_proj.out_features = self.embed_dim
+ self.k_proj.out_features = self.embed_dim
+ self.v_proj.out_features = self.embed_dim
+
+ def _set_skip_embed_dim_check(self):
+ self.skip_embed_dim_check = True
+
+ def _pad_masks(
+ self,
+ key_padding_mask: Optional[Tensor],
+ attn_mask: Optional[Tensor],
+ ) -> Tuple[Optional[Tensor], Optional[Tensor]]:
+ if attn_mask is not None:
+ shape = attn_mask.shape[:-1] + [
+ 1,
+ ]
+ attn_mask = paddle.concat(
+ [attn_mask, paddle.zeros(shape, dtype=attn_mask.dtype)],
+ axis=-1)
+ if key_padding_mask is not None:
+ shape = key_padding_mask.shape[:-1] + [
+ 1,
+ ]
+ key_padding_mask = paddle.concat(
+ [
+ key_padding_mask, paddle.zeros(
+ shape, dtype=key_padding_mask.dtype)
+ ],
+ axis=-1)
+ return key_padding_mask, attn_mask
+
+ def _add_bias(
+ self,
+ k: Tensor,
+ v: Tensor,
+ key_padding_mask: Optional[Tensor],
+ attn_mask: Optional[Tensor],
+ bsz: int,
+ ) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]:
+ assert self.bias_k is not None
+ assert self.bias_v is not None
+ k = paddle.concat([k, self.bias_k.tile([1, bsz, 1])], axis=-1)
+ v = paddle.concat([v, self.bias_v.tile([1, bsz, 1])], axis=-1)
+ key_padding_mask, attn_mask = self._pad_masks(
+ key_padding_mask=key_padding_mask, attn_mask=attn_mask)
+ return k, v, key_padding_mask, attn_mask
+
+ def _append_zero_attn(
+ self,
+ k: Tensor,
+ v: Tensor,
+ key_padding_mask: Optional[Tensor],
+ attn_mask: Optional[Tensor],
+ ) -> Tuple[Tensor, Tensor, Optional[Tensor], Optional[Tensor]]:
+ zero_attn_shape = k.shape[:-2] + [1] + k.shape[-1:]
+ k = paddle.concat(
+ [k, paddle.zeros(zero_attn_shape, dtype=k.dtype)], axis=-2)
+ v = paddle.concat(
+ [v, paddle.zeros(zero_attn_shape, dtype=v.dtype)], axis=-2)
+ key_padding_mask, attn_mask = self._pad_masks(
+ key_padding_mask=key_padding_mask, attn_mask=attn_mask)
+ return k, v, key_padding_mask, attn_mask
+
+ def forward(
+ self,
+ query,
+ key: Optional[Tensor],
+ value: Optional[Tensor],
+ key_padding_mask: Optional[Tensor]=None,
+ incremental_state: Optional[Dict[str, Dict[str, Optional[
+ Tensor]]]]=None,
+ need_weights: bool=True,
+ static_kv: bool=False,
+ attn_mask: Optional[Tensor]=None,
+ before_softmax: bool=False,
+ need_head_weights: bool=False, ) -> Tuple[Tensor, Optional[Tensor]]:
+ """Input shape: Time x Batch x Channel
+
+ Args:
+ key_padding_mask (ByteTensor, optional): mask to exclude
+ keys that are pads, of shape `(batch, src_len)`, where
+ padding elements are indicated by 1s.
+ need_weights (bool, optional): return the attention weights,
+                averaged over heads (default: True).
+ attn_mask (ByteTensor, optional): typically used to
+ implement causal attention, where the mask prevents the
+ attention from looking forward in time (default: None).
+ before_softmax (bool, optional): return the raw attention
+ weights and values before the attention softmax.
+ need_head_weights (bool, optional): return the attention
+ weights for each head. Implies *need_weights*. Default:
+ return the average attention weights over all heads.
+ """
+ if need_head_weights:
+ need_weights = True
+
+ is_tpu = query.place == "xla"
+
+ tgt_len, bsz, embed_dim = query.shape
+ src_len = tgt_len
+ if not self.skip_embed_dim_check:
+ assert (embed_dim == self.embed_dim
+ ), f"query dim {embed_dim} != {self.embed_dim}"
+ assert list(query.shape) == [tgt_len, bsz, embed_dim]
+ if key is not None:
+ src_len, key_bsz, _ = key.shape
+ # if not torch.jit.is_scripting():
+ # assert value is not None
+ # assert src_len, key_bsz == value.shape[:2]
+
+ # if (
+ # not self.onnx_trace
+ # and not is_tpu # don't use PyTorch version on TPUs
+ # and incremental_state is None
+ # and not static_kv
+ # # A workaround for quantization to work. Otherwise JIT compilation
+ # # treats bias in linear module as method.
+ # and not torch.jit.is_scripting()
+ # # The Multihead attention implemented in pytorch forces strong dimension check
+ # # for input embedding dimention and K,Q,V projection dimension.
+ # # Since pruning will break the dimension check and it is not easy to modify the pytorch API,
+ # # it is preferred to bypass the pytorch MHA when we need to skip embed_dim_check
+ # and not self.skip_embed_dim_check
+ # ):
+ # assert key is not None and value is not None
+
+ # if self.use_xformers:
+ # return self._xformers_attn_forward(
+ # query, key, value, key_padding_mask, need_weights, attn_mask
+ # )
+
+ # else:
+ # return F.multi_head_attention_forward(
+ # query,
+ # key,
+ # value,
+ # self.embed_dim,
+ # self.num_heads,
+ # torch.empty([0]),
+ # torch.cat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias)),
+ # self.bias_k,
+ # self.bias_v,
+ # self.add_zero_attn,
+ # self.dropout_module.p,
+ # self.out_proj.weight,
+ # self.out_proj.bias,
+ # self.training or self.dropout_module.apply_during_inference,
+ # key_padding_mask,
+ # need_weights,
+ # attn_mask,
+ # use_separate_proj_weight=True,
+ # q_proj_weight=self.q_proj.weight,
+ # k_proj_weight=self.k_proj.weight,
+ # v_proj_weight=self.v_proj.weight,
+ # )
+
+ if incremental_state is not None:
+ saved_state = self._get_input_buffer(incremental_state)
+ if saved_state is not None and "prev_key" in saved_state:
+ # previous time steps are cached - no need to recompute
+ # key and value if they are static
+ if static_kv:
+ assert self.encoder_decoder_attention and not self.self_attention
+ key = value = None
+ else:
+ saved_state = None
+
+ if self.self_attention:
+ q = self.q_proj(query)
+ k = self.k_proj(query)
+ v = self.v_proj(query)
+ elif self.encoder_decoder_attention:
+ # encoder-decoder attention
+ q = self.q_proj(query)
+ if key is None:
+ assert value is None
+ k = v = None
+ else:
+                if self.beam_size > 1 and bsz == key.shape[1]:
+                    # key is [T, bsz*beam_size, C], reduce to [T, bsz, C]
+                    key = key.reshape(
+                        [key.shape[0], -1, self.beam_size,
+                         key.shape[2]])[:, :, 0, :]
+                    if key_padding_mask is not None:
+                        key_padding_mask = key_padding_mask.reshape(
+                            [-1, self.beam_size,
+                             key_padding_mask.shape[1]])[:, 0, :]
+ k = self.k_proj(key)
+ v = self.v_proj(key)
+
+ else:
+ assert key is not None and value is not None
+ q = self.q_proj(query)
+ k = self.k_proj(key)
+ v = self.v_proj(value)
+ q *= self.scaling
+
+ if self.bias_k is not None:
+ assert self.bias_v is not None
+ k, v, attn_mask, key_padding_mask = self._add_bias(
+ k, v, attn_mask, key_padding_mask, bsz)
+
+ q = paddle.reshape(
+ q, [tgt_len, bsz * self.num_heads, self.head_dim]).transpose(
+ [1, 0, 2])
+ kv_bsz = bsz # need default value for scripting
+ if k is not None:
+ kv_bsz = k.shape[1]
+ k = paddle.reshape(
+ k, [-1, kv_bsz * self.num_heads, self.head_dim]).transpose(
+ [1, 0, 2])
+ if v is not None:
+ v = paddle.reshape(
+ v, [-1, kv_bsz * self.num_heads, self.head_dim]).transpose(
+ [1, 0, 2])
+
+ if saved_state is not None:
+ # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
+ if "prev_key" in saved_state:
+ _prev_key = saved_state["prev_key"]
+ assert _prev_key is not None
+ kv_bsz = _prev_key.shape[0]
+ prev_key = _prev_key.reshape(
+ [kv_bsz * self.num_heads, -1, self.head_dim])
+ if static_kv:
+ k = prev_key
+ else:
+ assert k is not None
+ k = paddle.concat([prev_key, k], axis=1)
+ src_len = k.shape[1]
+ if "prev_value" in saved_state:
+ _prev_value = saved_state["prev_value"]
+ assert _prev_value is not None
+                assert kv_bsz == _prev_value.shape[0]
+ prev_value = _prev_value.reshape(
+ [kv_bsz * self.num_heads, -1, self.head_dim])
+ if static_kv:
+ v = prev_value
+ else:
+ assert v is not None
+ v = paddle.concat([prev_value, v], axis=1)
+ prev_key_padding_mask: Optional[Tensor] = None
+ if "prev_key_padding_mask" in saved_state:
+ prev_key_padding_mask = saved_state["prev_key_padding_mask"]
+ assert k is not None and v is not None
+ key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
+ key_padding_mask=key_padding_mask,
+ prev_key_padding_mask=prev_key_padding_mask,
+ batch_size=kv_bsz,
+ src_len=k.shape[1],
+ static_kv=static_kv, )
+
+ saved_state["prev_key"] = k.reshape(
+ [kv_bsz, self.num_heads, -1, self.head_dim])
+ saved_state["prev_value"] = v.reshape(
+ [kv_bsz, self.num_heads, -1, self.head_dim])
+ saved_state["prev_key_padding_mask"] = key_padding_mask
+ # In this branch incremental_state is never None
+ assert incremental_state is not None
+ incremental_state = self._set_input_buffer(incremental_state,
+ saved_state)
+ assert k is not None
+ assert k.shape[1] == src_len
+
+ # This is part of a workaround to get around fork/join parallelism
+ # not supporting Optional types.
+ if key_padding_mask is not None and key_padding_mask.dim() == 0:
+ key_padding_mask = None
+
+ if key_padding_mask is not None:
+ assert key_padding_mask.shape[0] == kv_bsz
+ assert key_padding_mask.shape[1] == src_len
+
+ if self.add_zero_attn:
+ assert v is not None
+ src_len += 1
+ k, v, key_padding_mask, attn_mask = self._append_zero_attn(
+ k=k,
+ v=v,
+ key_padding_mask=key_padding_mask,
+ attn_mask=attn_mask)
+
+ if self.encoder_decoder_attention and bsz != kv_bsz:
+ attn_weights = paddle.einsum(
+ "bxhtd,bhsd->bxhts",
+ q.reshape([kv_bsz, -1, self.num_heads] + q.shape[1:]),
+ k.reshape([kv_bsz, self.num_heads] + k.shape[1:]), )
+ attn_weights = attn_weights.reshape([
+ -1,
+ ] + attn_weights.shape[-2:])
+ else:
+ attn_weights = paddle.bmm(q, k.transpose([0, 2, 1]))
+ attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len,
+ bsz)
+
+ assert list(
+ attn_weights.shape) == [bsz * self.num_heads, tgt_len, src_len]
+
+ if attn_mask is not None:
+ attn_mask = attn_mask.unsqueeze(0)
+ if self.onnx_trace:
+ attn_mask = attn_mask.tile([attn_weights.shape[0], 1, 1])
+ attn_weights += attn_mask
+
+ if key_padding_mask is not None:
+ # don't attend to padding symbols
+ attn_weights = attn_weights.reshape(
+ [bsz, self.num_heads, tgt_len, src_len])
+ if not is_tpu:
+ attn_weights = attn_weights.reshape(
+ [kv_bsz, -1, self.num_heads, tgt_len, src_len])
+ attn_weights = paddle.where(
+ key_padding_mask.unsqueeze(1).unsqueeze(2).unsqueeze(3)
+ .astype('bool'),
+ float('-inf') * paddle.ones_like(attn_weights),
+ attn_weights)
+ else:
+ attn_weights = attn_weights.transpose([2, 1, 0])
+ attn_weights = paddle.where(key_padding_mask,
+ float('-inf') *
+ paddle.ones_like(attn_weights),
+ attn_weights)
+ attn_weights = attn_weights.transpose([2, 1, 0])
+ attn_weights = attn_weights.reshape(
+ [bsz * self.num_heads, tgt_len, src_len])
+
+ if before_softmax:
+ return attn_weights, v
+
+ def softmax_supporting_onnx_trace(x, dim: int, onnx_trace: bool=False):
+ if onnx_trace:
+ return F.softmax(x, axis=dim)
+ else:
+ return F.softmax(x, axis=dim, dtype='float32')
+
+ attn_weights_float = softmax_supporting_onnx_trace(
+ attn_weights, dim=-1, onnx_trace=self.onnx_trace)
+ attn_weights = paddle.cast(attn_weights_float, attn_weights.dtype)
+ attn_probs = self.dropout_module(attn_weights)
+
+ assert v is not None
+ if self.encoder_decoder_attention and bsz != kv_bsz:
+ attn = paddle.einsum(
+ "bxhts,bhsd->bxhtd",
+ attn_probs.reshape([kv_bsz, -1, self.num_heads] +
+ attn_probs.shape[1:]),
+ v.reshape([kv_bsz, self.num_heads] + v.shape[1:]), )
+ attn = attn.reshape([
+ -1,
+ ] + attn.shape[-2:])
+ else:
+ attn = paddle.bmm(attn_probs, v)
+ assert list(
+ attn.shape) == [bsz * self.num_heads, tgt_len, self.head_dim]
+ if self.onnx_trace and attn.shape[1] == 1:
+ # when ONNX tracing a single decoder step (sequence length == 1)
+ # the transpose is a no-op copy before view, thus unnecessary
+ attn = attn.reshape([tgt_len, bsz, self.embed_dim])
+ else:
+ attn = attn.transpose([1, 0, 2]).reshape(
+ [tgt_len, bsz, self.embed_dim])
+ attn = self.out_proj(attn)
+ attn_weights: Optional[Tensor] = None
+ if need_weights:
+ attn_weights = attn_weights_float.reshape(
+ [bsz, self.num_heads, tgt_len, src_len]).transpose([1, 0, 2, 3])
+ if not need_head_weights:
+ # average attention weights over heads
+ attn_weights = attn_weights.mean(axis=0)
+
+ return attn, attn_weights
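+
+    # Hedged self-attention sketch (editorial illustration): with x of shape
+    # [T, B, C] (Time x Batch x Channel, as documented above),
+    #     mha = MultiheadAttention(embed_dim=768, num_heads=12, self_attention=True)
+    #     out, weights = mha(query=x, key=x, value=x)
+    # returns out with the same [T, B, C] shape plus head-averaged attention weights.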
+
+ @staticmethod
+ def _append_prev_key_padding_mask(
+ key_padding_mask: Optional[Tensor],
+ prev_key_padding_mask: Optional[Tensor],
+ batch_size: int,
+ src_len: int,
+ static_kv: bool, ) -> Optional[Tensor]:
+ # saved key padding masks have shape (bsz, seq_len)
+ if prev_key_padding_mask is not None and static_kv:
+ new_key_padding_mask = prev_key_padding_mask
+ elif prev_key_padding_mask is not None and key_padding_mask is not None:
+ new_key_padding_mask = paddle.concat(
+ [
+ paddle.cast(prev_key_padding_mask, 'float32'),
+ paddle.cast(key_padding_mask, 'float32')
+ ],
+ axis=1)
+ # During incremental decoding, as the padding token enters and
+ # leaves the frame, there will be a time when prev or current
+ # is None
+ elif prev_key_padding_mask is not None:
+ if src_len > prev_key_padding_mask.shape[1]:
+ filler = paddle.zeros(
+ [batch_size, src_len - prev_key_padding_mask.shape[1]], )
+ new_key_padding_mask = paddle.concat(
+ [
+ paddle.cast(prev_key_padding_mask, 'float32'),
+ paddle.cast(filler, 'float32')
+ ],
+ axis=1)
+ else:
+ new_key_padding_mask = prev_key_padding_mask
+ elif key_padding_mask is not None:
+ if src_len > key_padding_mask.shape[1]:
+ filler = paddle.zeros(
+ [batch_size, src_len - key_padding_mask.shape[1]], )
+ new_key_padding_mask = paddle.concat(
+ [
+ paddle.cast(filler, 'float32'),
+ paddle.cast(key_padding_mask, 'float32')
+ ],
+ axis=1)
+ else:
+ new_key_padding_mask = paddle.cast(key_padding_mask, 'float32')
+ else:
+ new_key_padding_mask = prev_key_padding_mask
+ return new_key_padding_mask
+
+ @paddle.jit.to_static
+ def reorder_incremental_state(
+ self,
+ incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+ new_order: Tensor, ):
+ """Reorder buffered internal state (for incremental generation)."""
+ input_buffer = self._get_input_buffer(incremental_state)
+ if input_buffer is not None:
+ for k in input_buffer.keys():
+ input_buffer_k = input_buffer[k]
+ if input_buffer_k is not None:
+ if self.encoder_decoder_attention:
+ if input_buffer_k.shape[
+ 0] * self.beam_size == new_order.shape[0]:
+ return incremental_state
+ elif self.beam_size > 1:
+ input_buffer[k] = paddle.index_select(
+ input_buffer_k,
+ index=new_order.reshape(
+ [-1, self.beam_size])[:, 0] //
+ self.beam_size,
+ axis=0, )
+ else:
+ input_buffer[k] = paddle.index_select(
+ input_buffer_k, index=new_order, axis=0)
+ else:
+ input_buffer[k] = paddle.index_select(
+ input_buffer_k, index=new_order, axis=0)
+ incremental_state = self._set_input_buffer(incremental_state,
+ input_buffer)
+ return incremental_state
+
+ def set_beam_size(self, beam_size):
+ """Used for effiecient beamable enc-dec attention"""
+ self.beam_size = beam_size
+
+ def _get_input_buffer(
+ self,
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
+ ) -> Dict[str, Optional[Tensor]]:
+ result = self.get_incremental_state(incremental_state, "attn_state")
+ if result is not None:
+ return result
+ else:
+ empty_result: Dict[str, Optional[Tensor]] = {}
+ return empty_result
+
+ def _set_input_buffer(
+ self,
+ incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+ buffer: Dict[str, Optional[Tensor]], ):
+ return self.set_incremental_state(incremental_state, "attn_state",
+ buffer)
+
+ def apply_sparse_mask(self,
+ attn_weights,
+ tgt_len: int,
+ src_len: int,
+ bsz: int):
+ return attn_weights
+
+ def upgrade_state_dict_named(self, state_dict, name):
+ prefix = name + "." if name != "" else ""
+ items_to_add = {}
+ keys_to_remove = []
+ for k in state_dict.keys():
+ if k.endswith(prefix + "in_proj_weight"):
+ # in_proj_weight used to be q + k + v with same dimensions
+ dim = int(state_dict[k].shape[0] / 3)
+ items_to_add[prefix + "q_proj.weight"] = state_dict[k][:dim]
+ items_to_add[prefix +
+ "k_proj.weight"] = state_dict[k][dim:2 * dim]
+ items_to_add[prefix + "v_proj.weight"] = state_dict[k][2 * dim:]
+
+ keys_to_remove.append(k)
+
+ k_bias = prefix + "in_proj_bias"
+ if k_bias in state_dict.keys():
+ dim = int(state_dict[k].shape[0] / 3)
+ items_to_add[prefix +
+ "q_proj.bias"] = state_dict[k_bias][:dim]
+ items_to_add[prefix + "k_proj.bias"] = state_dict[k_bias][
+ dim:2 * dim]
+ items_to_add[prefix +
+ "v_proj.bias"] = state_dict[k_bias][2 * dim:]
+
+ keys_to_remove.append(prefix + "in_proj_bias")
+
+ for k in keys_to_remove:
+ del state_dict[k]
+
+ for key, value in items_to_add.items():
+ state_dict[key] = value
+
+
+class GumbelVectorQuantizer(nn.Layer):
+ def __init__(
+ self,
+ dim,
+ num_vars,
+ temp,
+ groups,
+ combine_groups,
+ vq_dim,
+ time_first,
+ activation=nn.GELU(),
+ weight_proj_depth=1,
+ weight_proj_factor=1, ):
+ """Vector quantization using gumbel softmax
+
+ Args:
+ dim: input dimension (channels)
+ num_vars: number of quantized vectors per group
+ temp: temperature for training. this should be a tuple of 3 elements: (start, stop, decay factor)
+ groups: number of groups for vector quantization
+ combine_groups: whether to use the vectors for all groups
+ vq_dim: dimensionality of the resulting quantized vector
+ time_first: if true, expect input in BxTxC format, otherwise in BxCxT
+ activation: what activation to use (should be a module). this is only used if weight_proj_depth is > 1
+ weight_proj_depth: number of layers (with activation in between) to project input before computing logits
+ weight_proj_factor: this is used only if weight_proj_depth is > 1. scales the inner dimensionality of
+ projections by this factor
+ """
+ super().__init__()
+
+ self.groups = groups
+ self.combine_groups = combine_groups
+ self.input_dim = dim
+ self.num_vars = num_vars
+ self.time_first = time_first
+
+ assert (
+ vq_dim % groups == 0
+ ), f"dim {vq_dim} must be divisible by groups {groups} for concatenation"
+
+ var_dim = vq_dim // groups
+ num_groups = groups if not combine_groups else 1
+
+ self.vars = self.create_parameter(
+ (1, num_groups * num_vars, var_dim),
+ default_initializer=nn.initializer.Uniform())
+
+ if weight_proj_depth > 1:
+
+ def block(input_dim, output_dim):
+ return nn.Sequential(Linear(input_dim, output_dim), activation)
+
+ inner_dim = self.input_dim * weight_proj_factor
+ self.weight_proj = nn.Sequential(
+ *[
+ block(self.input_dim if i == 0 else inner_dim, inner_dim)
+ for i in range(weight_proj_depth - 1)
+ ],
+ Linear(inner_dim, groups * num_vars), )
+ else:
+ self.weight_proj = Linear(
+ self.input_dim,
+ groups * num_vars,
+ weight_attr=nn.initializer.Normal(mean=0, std=1),
+ bias_attr=nn.initializer.Zero())
+
+ if isinstance(temp, str):
+ import ast
+
+ temp = ast.literal_eval(temp)
+ assert len(temp) == 3, f"{temp}, {len(temp)}"
+
+ self.max_temp, self.min_temp, self.temp_decay = temp
+ self.curr_temp = self.max_temp
+ self.codebook_indices = None
+
+ def set_num_updates(self, num_updates):
+ self.curr_temp = max(self.max_temp * self.temp_decay**num_updates,
+ self.min_temp)
+
+ def get_codebook_indices(self):
+ if self.codebook_indices is None:
+ from itertools import product
+
+ p = [range(self.num_vars)] * self.groups
+ inds = list(product(*p))
+ self.codebook_indices = paddle.to_tensor(
+ inds, dtype='int64', place=self.vars.place).flatten()
+
+ if not self.combine_groups:
+                self.codebook_indices = self.codebook_indices.reshape(
+                    [self.num_vars**self.groups, -1])
+ for b in range(1, self.groups):
+ self.codebook_indices[:, b] += self.num_vars * b
+ self.codebook_indices = self.codebook_indices.flatten()
+ return self.codebook_indices
+
+ def codebook(self):
+ indices = self.get_codebook_indices()
+        return (self.vars.squeeze(0).index_select(indices, axis=0)
+                .reshape([self.num_vars**self.groups, -1]))
+
+ def sample_from_codebook(self, b, n):
+ indices = self.get_codebook_indices()
+        indices = indices.reshape([-1, self.groups])
+ cb_size = indices.shape[0]
+ assert (n < cb_size
+ ), f"sample size {n} is greater than size of codebook {cb_size}"
+ sample_idx = paddle.randint(low=0, high=cb_size, shape=(b * n, ))
+ indices = indices[sample_idx]
+
+        z = self.vars.squeeze(0).index_select(
+            indices.flatten(), axis=0).reshape([b, n, -1])
+ return z
+
+ def to_codebook_index(self, indices):
+ res = paddle.full(indices.shape[:-1], 0, dtype=indices.dtype)
+ for i in range(self.groups):
+ exponent = self.groups - i - 1
+ res += indices[..., i] * (self.num_vars**exponent)
+ return res
+
+ def forward_idx(self, x):
+ res = self.forward(x, produce_targets=True)
+ return res["x"], res["targets"]
+
+ def forward(self, x, produce_targets=False):
+ result = {"num_vars": self.num_vars * self.groups}
+
+ if not self.time_first:
+ x = x.transpose([0, 2, 1])
+
+ bsz, tsz, fsz = x.shape
+ x = x.reshape([-1, fsz])
+ x = self.weight_proj(x)
+ x = x.reshape([bsz * tsz * self.groups, -1])
+
+        k = x.argmax(axis=-1)
+        # one-hot of the argmax, i.e. the "hard" assignment used at inference
+        hard_x = F.one_hot(k, num_classes=x.shape[-1]).astype(x.dtype)
+ hard_x = hard_x.reshape([bsz * tsz, self.groups, -1])
+ hard_probs = paddle.mean(hard_x.astype('float32'), axis=0)
+ result["code_perplexity"] = paddle.exp(-paddle.sum(
+ hard_probs * paddle.log(hard_probs + 1e-7), axis=-1)).sum()
+
+ avg_probs = F.softmax(
+ x.reshape([bsz * tsz, self.groups, -1]).astype('float32'),
+ axis=-1).mean(axis=0)
+ result["prob_perplexity"] = paddle.exp(-paddle.sum(
+ avg_probs * paddle.log(avg_probs + 1e-7), axis=-1)).sum()
+
+ result["temp"] = self.curr_temp
+
+ if self.training:
+ x = F.gumbel_softmax(
+ x.astype('float32'), temperature=self.curr_temp,
+ hard=True).astype(x.dtype)
+ else:
+ x = hard_x
+
+ x = x.reshape([bsz * tsz, -1])
+
+ vars = self.vars
+ if self.combine_groups:
+ vars = vars.tile([1, self.groups, 1])
+
+ if produce_targets:
+ result["targets"] = (x.reshape([bsz * tsz * self.groups, -1])
+ .argmax(axis=-1)
+ .reshape([bsz, tsz, self.groups]).detach())
+
+ x = x.unsqueeze(-1) * vars
+ x = x.reshape([bsz * tsz, self.groups, self.num_vars, -1])
+ x = x.sum(axis=-2)
+ x = x.reshape([bsz, tsz, -1])
+
+ if not self.time_first:
+ x = x.transpose([0, 2, 1])
+
+ result["x"] = x
+
+ return result
+
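+# Usage sketch (illustrative values, not from a released recipe): quantize
+# 768-dim frames into 2 groups of 320 codewords with an annealed Gumbel temperature.
+#   quantizer = GumbelVectorQuantizer(
+#       dim=768, num_vars=320, temp=(2, 0.5, 0.999995), groups=2,
+#       combine_groups=False, vq_dim=256, time_first=True)
+#   out = quantizer(paddle.randn([4, 100, 768]))  # B x T x C input
+#   out["x"]                 # quantized features, shape [4, 100, 256]
+#   out["prob_perplexity"]   # codebook-usage term consumed by the wav2vec 2.0 diversity loss
+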
+
+class GradMultiply(paddle.autograd.PyLayer):
+ @staticmethod
+ def forward(ctx, x, scale):
+ ctx.scale = scale
+ res = x.numpy().copy()
+ return paddle.to_tensor(res, dtype=x.dtype)
+
+ @staticmethod
+ def backward(ctx, grad):
+ return grad * ctx.scale, None
+
+
+class SamePad(nn.Layer):
+ def __init__(self, kernel_size, causal=False):
+ super().__init__()
+ if causal:
+ self.remove = kernel_size - 1
+ else:
+ self.remove = 1 if kernel_size % 2 == 0 else 0
+
+ def forward(self, x):
+ if self.remove > 0:
+ x = x[:, :, :-self.remove]
+ return x
+
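+# Note: SamePad trims the extra frame that a "same"-padded Conv1D produces when
+# the kernel size is even (e.g. the conv_pos=128 positional convolution below),
+# so the output length matches the input length again.
+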
+
+class TransposeLast(nn.Layer):
+ def __init__(self, deconstruct_idx=None):
+ super().__init__()
+ self.deconstruct_idx = deconstruct_idx
+
+ def forward(self, x):
+ if self.deconstruct_idx is not None:
+ x = x[self.deconstruct_idx]
+        perm = list(range(x.dim()))
+        perm[-1], perm[-2] = perm[-2], perm[-1]
+        return x.transpose(perm)
+
+
+class Fp32LayerNorm(LayerNorm):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def forward(self, input):
+ output = F.layer_norm(
+ input.astype('float32'),
+ self._normalized_shape,
+ self.weight.astype('float32') if self.weight is not None else None,
+ self.bias.astype('float32') if self.bias is not None else None,
+ self._epsilon, )
+ return output.astype(input.dtype)
+
+
+# Todo: change this when paddle supports F.group_norm
+class Fp32GroupNorm(nn.Layer):
+ def __init__(self, *args, **kwargs):
+ super().__init__()
+ self.group_norm = paddle.nn.GroupNorm(*args, **kwargs)
+ fp32_weight = paddle.create_parameter(
+ shape=self.group_norm.weight.shape,
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Assign(
+ self.group_norm.weight))
+ fp32_bias = paddle.create_parameter(
+ shape=self.group_norm.bias.shape,
+ dtype='float32',
+ default_initializer=paddle.nn.initializer.Assign(
+ self.group_norm.bias))
+ self.group_norm.weight = fp32_weight
+ self.group_norm.bias = fp32_bias
+
+ def forward(self, input):
+ output = self.group_norm(input.astype('float32'))
+ return output.astype(input.dtype)
+
+
+class StrEnumMeta(EnumMeta):
+    # this is a workaround for submitit pickling leading to instance checks failing in hydra for StrEnum, see
+ # https://github.com/facebookresearch/hydra/issues/1156
+ @classmethod
+ def __instancecheck__(cls, other):
+ return "enum" in str(type(other))
+
+
+class StrEnum(Enum, metaclass=StrEnumMeta):
+ def __str__(self):
+ return self.value
+
+ def __eq__(self, other: str):
+ return self.value == other
+
+ def __repr__(self):
+ return self.value
+
+ def __hash__(self):
+ return hash(str(self))
+
+
+def ChoiceEnum(choices: List[str]):
+ """return the Enum class used to enforce list of choices"""
+ return StrEnum("Choices", {k: k for k in choices})
+
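+# Note (a sketch): members produced by ChoiceEnum compare equal to plain strings
+# thanks to StrEnum.__eq__, e.g.
+#   Layer = ChoiceEnum(["transformer"])
+#   Layer.transformer == "transformer"   # True
+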
+
+def relu_squared(x: paddle.Tensor):
+ return F.relu(x).pow(2)
+
+
+def get_activation_fn(activation: str) -> Callable:
+ """Returns the activation function corresponding to `activation`"""
+
+ def gelu_accurate(x):
+ if not hasattr(gelu_accurate, "_a"):
+ gelu_accurate._a = math.sqrt(2 / math.pi)
+ return (0.5 * x * (1 + paddle.tanh(gelu_accurate._a *
+ (x + 0.044715 * paddle.pow(x, 3)))))
+
+ def gelu(x: paddle.Tensor) -> paddle.Tensor:
+ return paddle.nn.functional.gelu(x.astype('float32')).astype(x.dtype)
+
+ if activation == "relu":
+ return F.relu
+ elif activation == "relu_squared":
+ return relu_squared
+ elif activation == "gelu":
+ return gelu
+ elif activation == "gelu_fast":
+ return gelu_accurate
+ elif activation == "gelu_accurate":
+ return gelu_accurate
+ elif activation == "tanh":
+ return paddle.tanh
+ elif activation == "linear":
+ return lambda x: x
+ elif activation == "swish":
+        return paddle.nn.functional.swish
+ else:
+ raise RuntimeError(
+ "--activation-fn {} not supported".format(activation))
+
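+# Minimal sketch of how the lookup above is used elsewhere in this file:
+#   act = get_activation_fn("gelu")   # -> the float32-safe gelu wrapper above
+#   y = act(paddle.randn([2, 8]))
+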
+
+def get_available_activation_fns() -> List:
+ return [
+ "relu",
+ "gelu",
+ "gelu_fast", # deprecated
+ "gelu_accurate",
+ "tanh",
+ "linear",
+ ]
+
+
+def compute_mask_indices(
+ shape: Tuple[int, int],
+ padding_mask: Optional[paddle.Tensor],
+ mask_prob: float,
+ mask_length: int,
+ mask_type: str="static",
+ mask_other: float=0.0,
+ min_masks: int=0,
+ no_overlap: bool=False,
+ min_space: int=0,
+ require_same_masks: bool=True,
+ mask_dropout: float=0.0, ) -> np.ndarray:
+ """
+ Computes random mask spans for a given shape
+
+ Args:
+        shape: the shape for which to compute masks.
+ should be of size 2 where first element is batch size and 2nd is timesteps
+ padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
+ mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
+ number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
+ however due to overlaps, the actual number will be smaller (unless no_overlap is True)
+ mask_type: how to compute mask lengths
+ static = fixed size
+ uniform = sample from uniform distribution [mask_other, mask_length*2]
+ normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
+            poisson = sample from poisson distribution with lambda = mask_length
+ min_masks: minimum number of masked spans
+        no_overlap: if true, switches to an alternative recursive algorithm that prevents spans from overlapping
+ min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
+ require_same_masks: if true, will randomly drop out masks until same amount of masks remains in each sample
+ mask_dropout: randomly dropout this percentage of masks in each example
+ """
+
+ bsz, all_sz = shape
+ mask = np.full((bsz, all_sz), False)
+
+ all_num_mask = int(
+ # add a random number for probabilistic rounding
+ mask_prob * all_sz / float(mask_length) + np.random.rand())
+
+ all_num_mask = max(min_masks, all_num_mask)
+
+ mask_idcs = []
+ for i in range(bsz):
+ if padding_mask is not None:
+ sz = all_sz - padding_mask[i].long().sum().item()
+ num_mask = int(
+ # add a random number for probabilistic rounding
+ mask_prob * sz / float(mask_length) + np.random.rand())
+ num_mask = max(min_masks, num_mask)
+ else:
+ sz = all_sz
+ num_mask = all_num_mask
+
+ if mask_type == "static":
+ lengths = np.full(num_mask, mask_length)
+ elif mask_type == "uniform":
+ lengths = np.random.randint(
+ mask_other, mask_length * 2 + 1, size=num_mask)
+ elif mask_type == "normal":
+ lengths = np.random.normal(mask_length, mask_other, size=num_mask)
+ lengths = [max(1, int(round(x))) for x in lengths]
+ elif mask_type == "poisson":
+ lengths = np.random.poisson(mask_length, size=num_mask)
+ lengths = [int(round(x)) for x in lengths]
+ else:
+ raise Exception("unknown mask selection " + mask_type)
+
+ if sum(lengths) == 0:
+ lengths[0] = min(mask_length, sz - 1)
+
+ if no_overlap:
+ mask_idc = []
+
+ def arrange(s, e, length, keep_length):
+ span_start = np.random.randint(s, e - length)
+ mask_idc.extend(span_start + i for i in range(length))
+
+ new_parts = []
+ if span_start - s - min_space >= keep_length:
+ new_parts.append((s, span_start - min_space + 1))
+ if e - span_start - length - min_space > keep_length:
+ new_parts.append((span_start + length + min_space, e))
+ return new_parts
+
+ parts = [(0, sz)]
+ min_length = min(lengths)
+ for length in sorted(lengths, reverse=True):
+ lens = np.fromiter(
+ (e - s if e - s >= length + min_space else 0
+ for s, e in parts),
+                    int, )
+ l_sum = np.sum(lens)
+ if l_sum == 0:
+ break
+ probs = lens / np.sum(lens)
+ c = np.random.choice(len(parts), p=probs)
+ s, e = parts.pop(c)
+ parts.extend(arrange(s, e, length, min_length))
+ mask_idc = np.asarray(mask_idc)
+ else:
+ min_len = min(lengths)
+ if sz - min_len <= num_mask:
+ min_len = sz - num_mask - 1
+
+ mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
+
+ mask_idc = np.asarray([
+ mask_idc[j] + offset
+ for j in range(len(mask_idc)) for offset in range(lengths[j])
+ ])
+
+ mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
+
+ min_len = min([len(m) for m in mask_idcs])
+ for i, mask_idc in enumerate(mask_idcs):
+ if len(mask_idc) > min_len and require_same_masks:
+ mask_idc = np.random.choice(mask_idc, min_len, replace=False)
+ if mask_dropout > 0:
+ num_holes = np.rint(len(mask_idc) * mask_dropout).astype(int)
+ mask_idc = np.random.choice(
+ mask_idc, len(mask_idc) - num_holes, replace=False)
+
+ mask[i, mask_idc] = True
+
+ return mask
+
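+# Illustrative call (a sketch): mask roughly 65% of 100 frames for a batch of
+# two utterances using fixed-length spans of 10 frames.
+#   mask = compute_mask_indices((2, 100), None, mask_prob=0.65, mask_length=10)
+#   mask.shape   # (2, 100); boolean numpy array, True marks masked positions
+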
+
+def index_put(tensor, indices, value):
+ tensor[indices] = value
+ return tensor
+
+
+# TODO: check whether caching the arange buffer is actually faster in Paddle
+def buffered_arange(max):
+    if not hasattr(buffered_arange, "buf"):
+        # start from an empty buffer so the first call fills it with arange below
+        buffered_arange.buf = paddle.empty([0], dtype='int64')
+ if max > buffered_arange.buf.numel():
+ buffered_arange.buf = paddle.arange(max)
+ return buffered_arange.buf[:max]
+
+
+def pad_to_multiple(x, multiple, dim=-1, value=0):
+ # Inspired from https://github.com/lucidrains/local-attention/blob/master/local_attention/local_attention.py#L41
+ if x is None:
+ return None, 0
+ tsz = x.shape[dim]
+ m = tsz / multiple
+ remainder = math.ceil(m) * multiple - tsz
+ if m.is_integer():
+ return x, 0
+ pad_offset = (0, ) * (-1 - dim) * 2
+ return F.pad(
+ x,
+ pad=[*pad_offset, 0, remainder, *pad_offset],
+ value=value,
+ data_format='NLC'), remainder
+
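+# Illustrative call (a sketch): pad the time axis of a B x T x C batch up to a
+# multiple of required_seq_len_multiple, as done before the transformer encoder.
+#   x, pad = pad_to_multiple(paddle.randn([2, 99, 768]), 2, dim=-2, value=0)
+#   x.shape[1], pad   # 100, 1
+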
+
+EXTRACTOR_MODE_CHOICES = ChoiceEnum(["default", "layer_norm"])
+MASKING_DISTRIBUTION_CHOICES = ChoiceEnum(
+ ["static", "uniform", "normal", "poisson"])
+LAYER_TYPE_CHOICES = ChoiceEnum(["transformer"]) # ToDo: conformer
+
+
+@dataclass
+class Wav2Vec2Config:
+ extractor_mode: EXTRACTOR_MODE_CHOICES = field(
+ default="default",
+ metadata={
+ "help":
+ "mode for feature extractor. default has a single group norm with d "
+ "groups in the first conv block, whereas layer_norm has layer norms in "
+ "every block (meant to use with normalize=True)"
+ }, )
+ encoder_layers: int = field(
+ default=12, metadata={"help": "num encoder layers in the transformer"})
+ encoder_embed_dim: int = field(
+ default=768, metadata={"help": "encoder embedding dimension"})
+ encoder_ffn_embed_dim: int = field(
+ default=3072, metadata={"help": "encoder embedding dimension for FFN"})
+ encoder_attention_heads: int = field(
+ default=12, metadata={"help": "num encoder attention heads"})
+ activation_fn: ChoiceEnum(get_available_activation_fns()) = field(
+ default="gelu", metadata={"help": "activation function to use"})
+ layer_type: LAYER_TYPE_CHOICES = field(
+ default="transformer", metadata={"help": "layer type in encoder"})
+ # dropouts
+ dropout: float = field(
+ default=0.1,
+ metadata={"help": "dropout probability for the transformer"})
+ attention_dropout: float = field(
+ default=0.1,
+ metadata={"help": "dropout probability for attention weights"})
+ activation_dropout: float = field(
+ default=0.0,
+ metadata={"help": "dropout probability after activation in FFN"})
+ encoder_layerdrop: float = field(
+ default=0.0,
+        metadata={"help": "probability of dropping a transformer layer"})
+ dropout_input: float = field(
+ default=0.0,
+ metadata={"help": "dropout to apply to the input (after feat extr)"}, )
+ dropout_features: float = field(
+ default=0.0,
+ metadata={"help": "dropout to apply to the features (after feat extr)"},
+ )
+
+ final_dim: int = field(
+ default=0,
+ metadata={
+ "help":
+ "project final representations and targets to this many dimensions."
+            " set to encoder_embed_dim if <= 0"
+ }, )
+ layer_norm_first: bool = field(
+ default=False,
+ metadata={"help": "apply layernorm first in the transformer"})
+ conv_feature_layers: str = field(
+ default="[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]",
+ metadata={
+ "help":
+ "string describing convolutional feature extraction layers in form of a python list that contains "
+ "[(dim, kernel_size, stride), ...]"
+ }, )
+ conv_bias: bool = field(
+ default=False, metadata={"help": "include bias in conv encoder"})
+ logit_temp: float = field(
+ default=0.1, metadata={"help": "temperature to divide logits by"})
+ quantize_targets: bool = field(
+ default=False, metadata={"help": "use quantized targets"})
+ quantize_input: bool = field(
+ default=False, metadata={"help": "use quantized inputs"})
+ same_quantizer: bool = field(
+ default=False,
+ metadata={"help": "use same quantizer for inputs and targets"})
+ target_glu: bool = field(
+ default=False, metadata={"help": "adds projection + glu to targets"})
+ feature_grad_mult: float = field(
+ default=1.0,
+ metadata={"help": "multiply feature extractor var grads by this"})
+ quantizer_depth: int = field(
+ default=1,
+ metadata={"help": "number of quantizer layers"}, )
+ quantizer_factor: int = field(
+ default=3,
+ metadata={
+ "help":
+ "dimensionality increase for inner quantizer layers (if depth > 1)"
+ }, )
+ latent_vars: int = field(
+ default=320,
+ metadata={
+ "help": "number of latent variables V in each group of the codebook"
+ }, )
+ latent_groups: int = field(
+ default=2,
+ metadata={
+ "help": "number of groups G of latent variables in the codebook"
+ }, )
+ latent_dim: int = field(
+ default=0,
+ metadata={
+ "help":
+ "if > 0, uses this dimensionality for latent variables. "
+ "otherwise uses final_dim / latent_groups"
+ }, )
+
+ # masking
+ mask_length: int = field(default=10, metadata={"help": "mask length"})
+ mask_prob: float = field(
+ default=0.65,
+ metadata={"help": "probability of replacing a token with mask"})
+ mask_selection: MASKING_DISTRIBUTION_CHOICES = field(
+ default="static", metadata={"help": "how to choose mask length"})
+ mask_other: float = field(
+ default=0,
+ metadata={
+ "help":
+ "secondary mask argument (used for more complex distributions), "
+ "see help in compute_mask_indices"
+ }, )
+ no_mask_overlap: bool = field(
+        default=False, metadata={"help": "if set, prohibit masks from overlapping"})
+ mask_min_space: int = field(
+ default=1,
+ metadata={"help": "min space between spans (if no overlap is enabled)"},
+ )
+ require_same_masks: bool = field(
+ default=True,
+ metadata={
+ "help":
+            "whether the number of masked timesteps must be the same across all "
+ "examples in a batch"
+ }, )
+ mask_dropout: float = field(
+ default=0.0,
+ metadata={"help": "percent of masks to unmask for each sample"}, )
+
+ # channel masking
+ mask_channel_length: int = field(
+ default=10,
+ metadata={"help": "length of the mask for features (channels)"})
+ mask_channel_prob: float = field(
+ default=0.0,
+ metadata={"help": "probability of replacing a feature with 0"})
+ mask_channel_before: bool = False
+ mask_channel_selection: MASKING_DISTRIBUTION_CHOICES = field(
+ default="static",
+ metadata={"help": "how to choose mask length for channel masking"}, )
+ mask_channel_other: float = field(
+ default=0,
+ metadata={
+ "help":
+ "secondary mask argument (used for more complex distributions), "
+            "see help in compute_mask_indices"
+ }, )
+ no_mask_channel_overlap: bool = field(
+ default=False,
+        metadata={"help": "if set, prohibit channel masks from overlapping"})
+ mask_channel_min_space: int = field(
+ default=1,
+ metadata={"help": "min space between spans (if no overlap is enabled)"},
+ )
+
+ # negative selection
+ num_negatives: int = field(
+ default=100,
+ metadata={"help": "number of negative examples from the same sample"}, )
+ negatives_from_everywhere: bool = field(
+ default=False,
+ metadata={
+ "help": "sample negatives from everywhere, not just masked states"
+ }, )
+ cross_sample_negatives: int = field(
+ default=0,
+        metadata={"help": "number of negative examples from any sample"})
+ codebook_negatives: int = field(
+        default=0, metadata={"help": "number of negative examples from the codebook"})
+
+ # positional embeddings
+ conv_pos: int = field(
+ default=128,
+ metadata={
+ "help": "number of filters for convolutional positional embeddings"
+ }, )
+ conv_pos_groups: int = field(
+ default=16,
+ metadata={
+ "help": "number of groups for convolutional positional embedding"
+ }, )
+ pos_conv_depth: int = field(
+ default=1,
+ metadata={"help": "depth of positional encoder network"}, )
+
+ latent_temp: Tuple[float, float, float] = field(
+ default=(2, 0.5, 0.999995),
+ metadata={
+ "help":
+ "temperature for latent variable sampling. "
+ "can be tuple of 3 values (start, end, decay)"
+ }, )
+ max_positions: int = field(
+ default=100000, metadata={"help": "Max positions"})
+ checkpoint_activations: bool = field(
+ default=False,
+ metadata={
+ "help": "recompute activations and save memory for extra compute"
+ }, )
+
+ # FP16 optimization
+ required_seq_len_multiple: int = field(
+ default=2,
+ metadata={
+ "help":
+ "pad the input to encoder such that the sequence length is divisible by multiple"
+ }, )
+ crop_seq_to_multiple: int = field(
+ default=1,
+ metadata={
+ "help":
+ "crop convolutional feature extractor output such that the sequence length is divisible by multiple"
+ }, )
+
+ # Conformer
+ depthwise_conv_kernel_size: int = field(
+ default=31,
+ metadata={
+ "help":
+ "depthwise-conv-kernel-size for convolution in conformer layer"
+ }, )
+ attn_type: str = field(
+ default="",
+        metadata={"help": "if set to 'espnet', use ESPNET multi-head attention"}, )
+ pos_enc_type: str = field(
+ default="abs",
+ metadata={"help": "Positional encoding type to use in conformer"}, )
+ fp16: bool = field(
+ default=False, metadata={"help": "If fp16 is being used"})
+
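+# Construction sketch (field values are illustrative, not a released recipe):
+#   cfg = Wav2Vec2Config(quantize_targets=True, final_dim=256)
+#   model = Wav2Vec2Model(cfg)
+#   out = model(paddle.randn([2, 16000]))   # raw 16 kHz waveforms, B x T
+#   out["x"]             # contrastive logits over the positive and sampled negatives
+#   out["features_pen"]  # L2 penalty on the extractor features
+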
+
+class Wav2Vec2Model(nn.Layer):
+ def __init__(self, cfg: Wav2Vec2Config):
+ super().__init__()
+ self.cfg = cfg
+
+ feature_enc_layers = eval(cfg.conv_feature_layers)
+ self.embed = feature_enc_layers[-1][0]
+
+ self.feature_extractor = ConvFeatureExtractionModel(
+ conv_layers=feature_enc_layers,
+ dropout=0.0,
+ mode=cfg.extractor_mode,
+ conv_bias=cfg.conv_bias, )
+
+ self.post_extract_proj = (Linear(self.embed, cfg.encoder_embed_dim)
+ if self.embed != cfg.encoder_embed_dim and
+ not cfg.quantize_input else None)
+
+ self.crop_seq_to_multiple = cfg.crop_seq_to_multiple
+
+ self.mask_prob = cfg.mask_prob
+ self.mask_selection = cfg.mask_selection
+ self.mask_other = cfg.mask_other
+ self.mask_length = cfg.mask_length
+ self.no_mask_overlap = cfg.no_mask_overlap
+ self.mask_min_space = cfg.mask_min_space
+
+ self.mask_channel_prob = cfg.mask_channel_prob
+ self.mask_channel_before = cfg.mask_channel_before
+ self.mask_channel_selection = cfg.mask_channel_selection
+ self.mask_channel_other = cfg.mask_channel_other
+ self.mask_channel_length = cfg.mask_channel_length
+ self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
+ self.mask_channel_min_space = cfg.mask_channel_min_space
+
+ self.dropout_input = nn.Dropout(cfg.dropout_input)
+ self.dropout_features = nn.Dropout(cfg.dropout_features)
+
+ self.feature_grad_mult = cfg.feature_grad_mult
+
+ self.quantizer = None
+ self.input_quantizer = None
+
+ self.n_negatives = cfg.num_negatives
+ self.cross_sample_negatives = cfg.cross_sample_negatives
+ self.codebook_negatives = cfg.codebook_negatives
+ self.negatives_from_everywhere = cfg.negatives_from_everywhere
+
+ self.logit_temp = cfg.logit_temp
+
+ final_dim = cfg.final_dim if cfg.final_dim > 0 else cfg.encoder_embed_dim
+
+ if cfg.quantize_targets:
+ vq_dim = cfg.latent_dim if cfg.latent_dim > 0 else final_dim
+ self.quantizer = GumbelVectorQuantizer(
+ dim=self.embed,
+ num_vars=cfg.latent_vars,
+ temp=cfg.latent_temp,
+ groups=cfg.latent_groups,
+ combine_groups=False,
+ vq_dim=vq_dim,
+ time_first=True,
+ weight_proj_depth=cfg.quantizer_depth,
+ weight_proj_factor=cfg.quantizer_factor, )
+ self.project_q = Linear(vq_dim, final_dim)
+ else:
+ self.project_q = Linear(self.embed, final_dim)
+
+ if cfg.quantize_input:
+ if cfg.same_quantizer and self.quantizer is not None:
+ vq_dim = final_dim
+ self.input_quantizer = self.quantizer
+ else:
+ vq_dim = cfg.latent_dim if cfg.latent_dim > 0 else cfg.encoder_embed_dim
+ self.input_quantizer = GumbelVectorQuantizer(
+ dim=self.embed,
+ num_vars=cfg.latent_vars,
+ temp=cfg.latent_temp,
+ groups=cfg.latent_groups,
+ combine_groups=False,
+ vq_dim=vq_dim,
+ time_first=True,
+ weight_proj_depth=cfg.quantizer_depth,
+ weight_proj_factor=cfg.quantizer_factor, )
+ self.project_inp = Linear(vq_dim, cfg.encoder_embed_dim)
+
+ self.mask_emb = self.create_parameter(
+ shape=[cfg.encoder_embed_dim],
+ default_initializer=paddle.nn.initializer.Uniform(),
+ dtype='float32', )
+
+ encoder_cls = TransformerEncoder
+
+ self.encoder = encoder_cls(cfg)
+ self.layer_norm = LayerNorm(self.embed)
+
+ self.target_glu = None
+ if cfg.target_glu:
+ self.target_glu = nn.Sequential(
+ Linear(final_dim, final_dim * 2), GLU())
+
+ self.final_proj = Linear(cfg.encoder_embed_dim, final_dim)
+
+    def upgrade_state_dict_named(self, state_dict, name):
+        """Upgrade a (possibly old) state dict for new versions of fairseq."""
+        # nn.Layer has no upgrade_state_dict_named to delegate to, so this is a no-op.
+        return state_dict
+
+ @classmethod
+ def build_model(cls, cfg: Wav2Vec2Config, task=None):
+ """Build a new model instance."""
+ return cls(cfg)
+
+ def apply_mask(
+ self,
+ x,
+ padding_mask,
+ mask_indices=None,
+ mask_channel_indices=None, ):
+ B, T, C = x.shape
+
+ if self.mask_channel_prob > 0 and self.mask_channel_before:
+ mask_channel_indices = compute_mask_indices(
+ (B, C),
+ None,
+ self.mask_channel_prob,
+ self.mask_channel_length,
+ self.mask_channel_selection,
+ self.mask_channel_other,
+ no_overlap=self.no_mask_channel_overlap,
+ min_space=self.mask_channel_min_space, )
+ mask_channel_indices = (
+                paddle.to_tensor(mask_channel_indices, place=x.place)
+ .unsqueeze(1).expand([-1, T, -1]))
+ x[mask_channel_indices] = 0
+
+ if self.mask_prob > 0:
+ if mask_indices is None:
+ mask_indices = compute_mask_indices(
+ (B, T),
+ padding_mask,
+ self.mask_prob,
+ self.mask_length,
+ self.mask_selection,
+ self.mask_other,
+ min_masks=2,
+ no_overlap=self.no_mask_overlap,
+ min_space=self.mask_min_space,
+ require_same_masks=self.cfg.require_same_masks,
+ mask_dropout=self.cfg.mask_dropout, )
+ mask_indices = paddle.to_tensor(mask_indices, place=x.place)
+ x = index_put(x, mask_indices, self.mask_emb)
+ else:
+ mask_indices = None
+
+ if self.mask_channel_prob > 0 and not self.mask_channel_before:
+ if mask_channel_indices is None:
+ mask_channel_indices = compute_mask_indices(
+ (B, C),
+ None,
+ self.mask_channel_prob,
+ self.mask_channel_length,
+ self.mask_channel_selection,
+ self.mask_channel_other,
+ no_overlap=self.no_mask_channel_overlap,
+ min_space=self.mask_channel_min_space, )
+ mask_channel_indices = (
+ paddle.to_tensor(mask_channel_indices, place=x.place)
+ .unsqueeze(1).expand([-1, T, -1]))
+ x = index_put(x, mask_channel_indices, 0)
+
+ return x, mask_indices
+
+ def sample_negatives(self, y, num, padding_count=None):
+
+ if self.n_negatives == 0 and self.cross_sample_negatives == 0:
+ return paddle.empty([0], dtype=y.dtype)
+
+ bsz, tsz, fsz = y.shape
+ y = y.reshape([-1, fsz]) # BTC => (BxT)C
+
+ # FIXME: what happens if padding_count is specified?
+ cross_high = tsz * bsz
+ high = tsz - (padding_count or 0)
+ with paddle.no_grad():
+ assert high > 1, f"{bsz,tsz,fsz}"
+
+ if self.n_negatives > 0:
+ tszs = (buffered_arange(num).unsqueeze(-1)
+ .expand([-1, self.n_negatives]).flatten())
+
+ neg_idxs = paddle.randint(
+ low=0, high=high - 1, shape=[bsz, self.n_negatives * num])
+ neg_idxs[neg_idxs >= tszs] += 1
+
+ if self.cross_sample_negatives > 0:
+ tszs = (buffered_arange(num).unsqueeze(-1)
+ .expand([-1, self.cross_sample_negatives]).flatten())
+
+ cross_neg_idxs = paddle.randint(
+ low=0,
+ high=cross_high - 1,
+ shape=[bsz, self.cross_sample_negatives * num], )
+ cross_neg_idxs[cross_neg_idxs >= tszs] += 1
+
+ if self.n_negatives > 0:
+ neg_idxs = neg_idxs + (paddle.arange(bsz).unsqueeze(1) * high)
+ else:
+ neg_idxs = cross_neg_idxs
+
+ if self.cross_sample_negatives > 0 and self.n_negatives > 0:
+ neg_idxs = paddle.concat([neg_idxs, cross_neg_idxs], axis=1)
+
+ negs = y[neg_idxs.reshape([-1])]
+ negs = negs.reshape(
+ [bsz, num, self.n_negatives + self.cross_sample_negatives,
+ fsz]).transpose([2, 0, 1, 3]) # to NxBxTxC
+ return negs, neg_idxs
+
+ def compute_preds(self, x, y, negatives):
+ neg_is_pos = (y == negatives).all(-1)
+ y = y.unsqueeze(0)
+ targets = paddle.concat([y, negatives], axis=0)
+
+ logits = paddle.nn.functional.cosine_similarity(x, targets, axis=-1)
+ logits = logits / self.logit_temp
+ logits = logits.astype(x.dtype)
+
+ return logits
+
+ def _get_feat_extract_output_lengths(self, input_lengths: paddle.Tensor):
+ """
+ Computes the output length of the convolutional layers
+ """
+
+ def _conv_out_length(input_length, kernel_size, stride):
+ return paddle.floor((input_length - kernel_size) / stride + 1)
+
+ conv_cfg_list = eval(self.cfg.conv_feature_layers)
+
+ for i in range(len(conv_cfg_list)):
+ input_lengths = _conv_out_length(input_lengths, conv_cfg_list[i][1],
+ conv_cfg_list[i][2])
+
+ return paddle.cast(input_lengths, 'int64')
+
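+    # Example with the default extractor "[(512,10,5)] + [(512,3,2)]*4 + [(512,2,2)]*2":
+    # the first conv maps 16000 samples to floor((16000 - 10) / 5) + 1 = 3199 frames,
+    # and the full stack (stride product 320) ends at 49 frames, i.e. roughly one
+    # frame every 20 ms.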
+ def forward(
+ self,
+ source,
+ padding_mask=None,
+ mask=True,
+ features_only=False,
+ layer=None,
+ mask_indices=None,
+ mask_channel_indices=None,
+ padding_count=None, ):
+
+ if self.feature_grad_mult > 0:
+ features = self.feature_extractor(source)
+ if self.feature_grad_mult != 1.0:
+ features = GradMultiply.apply(features, self.feature_grad_mult)
+ else:
+ with paddle.no_grad():
+ features = self.feature_extractor(source)
+
+ features_pen = features.pow(2).mean()
+
+ features = features.transpose([0, 2, 1])
+ features = self.layer_norm(features)
+ unmasked_features = features.clone()
+
+ if padding_mask is not None and padding_mask.any():
+ input_lengths = (1 - paddle.cast(padding_mask, 'int64')).sum(-1)
+ # apply conv formula to get real output_lengths
+ output_lengths = self._get_feat_extract_output_lengths(
+ input_lengths)
+
+ padding_mask = paddle.zeros(
+ features.shape[:2], dtype=features.dtype)
+
+ # these two operations makes sure that all values
+ # before the output lengths indices are attended to
+ padding_mask[(paddle.arange(padding_mask.shape[0]),
+ output_lengths - 1, )] = 1
+ padding_mask = paddle.cast(
+ (1 - padding_mask.flip([-1]).cumsum(-1).flip([-1])), 'bool')
+ else:
+ padding_mask = None
+
+ time_steps_to_drop = features.shape[1] % self.crop_seq_to_multiple
+ if time_steps_to_drop != 0:
+ features = features[:, :-time_steps_to_drop]
+ unmasked_features = unmasked_features[:, :-time_steps_to_drop]
+ if padding_mask is not None:
+ padding_mask = padding_mask[:, :-time_steps_to_drop]
+
+ if self.post_extract_proj is not None:
+ features = self.post_extract_proj(features)
+
+ features = self.dropout_input(features)
+ unmasked_features = self.dropout_features(unmasked_features)
+
+ num_vars = None
+ code_ppl = None
+ prob_ppl = None
+ curr_temp = None
+
+ if self.input_quantizer:
+ q = self.input_quantizer(features, produce_targets=False)
+ features = q["x"]
+ num_vars = q["num_vars"]
+ code_ppl = q["code_perplexity"]
+ prob_ppl = q["prob_perplexity"]
+ curr_temp = q["temp"]
+ features = self.project_inp(features)
+
+ if mask:
+ x, mask_indices = self.apply_mask(
+ features,
+ padding_mask,
+ mask_indices=mask_indices,
+ mask_channel_indices=mask_channel_indices, )
+ if mask_indices is not None:
+ y = unmasked_features[mask_indices].reshape([
+ unmasked_features.shape[0], -1, unmasked_features.shape[-1]
+ ])
+ else:
+ x = features
+ y = unmasked_features
+ mask_indices = None
+
+ x, layer_results = self.encoder(
+ x, padding_mask=padding_mask, layer=layer)
+
+ if features_only:
+ return {
+ "x": x,
+ "padding_mask": padding_mask,
+ "features": unmasked_features,
+ "layer_results": layer_results,
+ }
+
+ if self.quantizer:
+ if self.negatives_from_everywhere:
+ q = self.quantizer(unmasked_features, produce_targets=False)
+ y = q["x"]
+ num_vars = q["num_vars"]
+ code_ppl = q["code_perplexity"]
+ prob_ppl = q["prob_perplexity"]
+ curr_temp = q["temp"]
+ y = self.project_q(y)
+
+ negs, _ = self.sample_negatives(
+ y,
+ mask_indices[0].sum(),
+ padding_count=padding_count, )
+ y = y[mask_indices].reshape([y.shape[0], -1, y.shape[-1]])
+
+ else:
+ q = self.quantizer(y, produce_targets=False)
+ y = q["x"]
+ num_vars = q["num_vars"]
+ code_ppl = q["code_perplexity"]
+ prob_ppl = q["prob_perplexity"]
+ curr_temp = q["temp"]
+
+ y = self.project_q(y)
+
+ negs, _ = self.sample_negatives(
+ y,
+ y.shape[1],
+ padding_count=padding_count, )
+
+ if self.codebook_negatives > 0:
+ cb_negs = self.quantizer.sample_from_codebook(
+ y.shape[0] * y.shape[1], self.codebook_negatives)
+ cb_negs = cb_negs.reshape(
+ [self.codebook_negatives, y.shape[0], y.shape[1],
+                         -1])  # order doesn't matter
+ cb_negs = self.project_q(cb_negs)
+ negs = paddle.concat([negs, cb_negs], axis=0)
+ else:
+ y = self.project_q(y)
+
+ if self.negatives_from_everywhere:
+ negs, _ = self.sample_negatives(
+ unmasked_features,
+ y.shape[1],
+ padding_count=padding_count, )
+ negs = self.project_q(negs)
+ else:
+ negs, _ = self.sample_negatives(
+ y,
+ y.shape[1],
+ padding_count=padding_count, )
+
+ x = x[mask_indices].reshape([x.shape[0], -1, x.shape[-1]])
+
+ if self.target_glu:
+ y = self.target_glu(y)
+ negs = self.target_glu(negs)
+
+ x = self.final_proj(x)
+ x = self.compute_preds(x, y, negs)
+
+ result = {
+ "x": x,
+ "padding_mask": padding_mask,
+ "features_pen": features_pen,
+ }
+
+ if prob_ppl is not None:
+ result["prob_perplexity"] = prob_ppl
+ result["code_perplexity"] = code_ppl
+ result["num_vars"] = num_vars
+ result["temp"] = curr_temp
+
+ return result
+
+ def quantize(self, x):
+ assert self.quantizer is not None
+ x = self.feature_extractor(x)
+ x = x.transpose([0, 2, 1])
+ x = self.layer_norm(x)
+ return self.quantizer.forward_idx(x)
+
+ def extract_features(self, source, padding_mask, mask=False, layer=None):
+ res = self.forward(
+ source, padding_mask, mask=mask, features_only=True, layer=layer)
+ return res
+
+ def get_logits(self, net_output):
+ logits = net_output["x"]
+ logits = logits.transpose([2, 1, 0])
+ logits = logits.reshape([-1, logits.shape[-1]])
+ return logits
+
+ def get_targets(self, sample, net_output, expand_steps=True):
+ x = net_output["x"]
+        return paddle.zeros([x.shape[1] * x.shape[2]], dtype='int64')
+
+ def get_extra_losses(self, net_output):
+ pen = []
+
+ if "prob_perplexity" in net_output:
+ pen.append((net_output["num_vars"] - net_output["prob_perplexity"])
+ / net_output["num_vars"])
+
+ if "features_pen" in net_output:
+ pen.append(net_output["features_pen"])
+
+ return pen
+
+ def remove_pretraining_modules(self, last_layer=None):
+ self.quantizer = None
+ self.project_q = None
+ self.target_glu = None
+ self.final_proj = None
+
+ if last_layer is not None:
+ self.encoder.layers = nn.LayerList(
+ l for i, l in enumerate(self.encoder.layers) if i <= last_layer)
+
+
+class ConvFeatureExtractionModel(nn.Layer):
+ def __init__(
+ self,
+ conv_layers: List[Tuple[int, int, int]],
+ dropout: float=0.0,
+ mode: str="default",
+ conv_bias: bool=False, ):
+ super().__init__()
+
+ assert mode in {"default", "layer_norm"}
+
+ def block(
+ n_in,
+ n_out,
+ k,
+ stride,
+ is_layer_norm=False,
+ is_group_norm=False,
+ conv_bias=False, ):
+ def make_conv():
+ conv = Conv1D(
+ n_in,
+ n_out,
+ k,
+ stride=stride,
+ bias_attr=conv_bias
+ if not conv_bias else paddle.ParamAttr())
+ # nn.initializer.KaimingNormal()(conv.weight)
+ return conv
+
+ assert (is_layer_norm and is_group_norm
+ ) is False, "layer norm and group norm are exclusive"
+
+ if is_layer_norm:
+ return nn.Sequential(
+ make_conv(),
+ nn.Dropout(p=dropout),
+ nn.Sequential(
+ TransposeLast(),
+ Fp32LayerNorm(dim),
+ TransposeLast(), ),
+ nn.GELU(), )
+ elif is_group_norm:
+ return nn.Sequential(
+ make_conv(),
+ nn.Dropout(p=dropout),
+ Fp32GroupNorm(dim, dim),
+ nn.GELU(), )
+ else:
+ return nn.Sequential(
+ make_conv(), nn.Dropout(p=dropout), nn.GELU())
+
+ in_d = 1
+ self.conv_layers = nn.LayerList()
+ for i, cl in enumerate(conv_layers):
+ assert len(cl) == 3, "invalid conv definition: " + str(cl)
+ (dim, k, stride) = cl
+
+ self.conv_layers.append(
+ block(
+ in_d,
+ dim,
+ k,
+ stride,
+ is_layer_norm=mode == "layer_norm",
+ is_group_norm=mode == "default" and i == 0,
+ conv_bias=conv_bias, ))
+ in_d = dim
+
+ def forward(self, x):
+
+ # BxT -> BxCxT
+ x = x.unsqueeze(1)
+ for conv in self.conv_layers:
+ x = conv(x)
+
+ return x
+
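+# Shape sketch (assuming 16 kHz input and the default layer spec): a raw waveform
+# batch of shape [B, 16000] comes out as features of shape [B, 512, 49].
+#   layers = eval("[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]")
+#   extractor = ConvFeatureExtractionModel(conv_layers=layers)
+#   feats = extractor(paddle.randn([2, 16000]))   # -> [2, 512, 49]
+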
+
+def make_conv_pos(e, k, g):
+ dropout = 0
+ std = math.sqrt((4 * (1.0 - dropout)) / (k * e))
+ pos_conv = Conv1D(
+ e,
+ e,
+ kernel_size=k,
+ padding=k // 2,
+ groups=g,
+ weight_attr=nn.initializer.Normal(mean=0, std=std),
+ bias_attr=nn.initializer.Constant(0))
+ pos_conv = nn.utils.weight_norm(pos_conv, name="weight", dim=2)
+ pos_conv = nn.Sequential(pos_conv, SamePad(k), nn.GELU())
+
+ return pos_conv
+
+
+class TransformerEncoder(nn.Layer):
+ def build_encoder_layer(self, args: Wav2Vec2Config):
+ layer = TransformerSentenceEncoderLayer(
+ embedding_dim=self.embedding_dim,
+ ffn_embedding_dim=args.encoder_ffn_embed_dim,
+ num_attention_heads=args.encoder_attention_heads,
+ dropout=self.dropout,
+ attention_dropout=args.attention_dropout,
+ activation_dropout=args.activation_dropout,
+ activation_fn=args.activation_fn,
+ layer_norm_first=args.layer_norm_first, )
+ return layer
+
+ def __init__(self, args: Wav2Vec2Config):
+ super().__init__()
+
+        self.args = args  # kept so that max_positions() can read args.max_positions
+        self.dropout = args.dropout
+        self.embedding_dim = args.encoder_embed_dim
+        self.required_seq_len_multiple = args.required_seq_len_multiple
+
+ pos_conv_depth = getattr(args, "pos_conv_depth", 1)
+ if pos_conv_depth > 1:
+ num_layers = args.pos_conv_depth
+ k = max(3, args.conv_pos // num_layers)
+
+ def make_conv_block(e, k, g, l):
+ return nn.Sequential(*[
+ nn.Sequential(
+ Conv1D(
+ e,
+ e,
+ kernel_size=k,
+ padding=k // 2,
+ groups=g, ),
+ SamePad(k),
+ TransposeLast(),
+ LayerNorm(e, elementwise_affine=False),
+ TransposeLast(),
+ nn.GELU(), ) for _ in range(l)
+ ])
+
+ self.pos_conv = make_conv_block(self.embedding_dim, k,
+ args.conv_pos_groups, num_layers)
+
+ else:
+ self.pos_conv = make_conv_pos(
+ self.embedding_dim,
+ args.conv_pos,
+ args.conv_pos_groups, )
+
+ self.layers = nn.LayerList([
+ self.build_encoder_layer(args) for _ in range(args.encoder_layers)
+ ])
+ self.layer_norm_first = args.layer_norm_first
+ self.layer_norm = LayerNorm(self.embedding_dim)
+ self.layerdrop = args.encoder_layerdrop
+
+ def forward(self, x, padding_mask=None, layer=None):
+ x, layer_results = self.extract_features(x, padding_mask, layer)
+ if self.layer_norm_first and layer is None:
+ x = self.layer_norm(x)
+
+ return x, layer_results
+
+ def extract_features(
+ self,
+ x,
+ padding_mask=None,
+ tgt_layer=None,
+ min_layer=0, ):
+ if padding_mask is not None:
+ x = index_put(x, padding_mask, 0)
+
+ x_conv = self.pos_conv(x.transpose([0, 2, 1]))
+ x_conv = x_conv.transpose([0, 2, 1])
+ x = x + x_conv
+
+ if not self.layer_norm_first:
+ x = self.layer_norm(x)
+
+ # pad to the sequence length dimension
+ x, pad_length = pad_to_multiple(
+ x, self.required_seq_len_multiple, dim=-2, value=0)
+ if pad_length > 0 and padding_mask is None:
+ padding_mask = paddle.zeros([x.shape[0], x.shape[1]], dtype='bool')
+ padding_mask[:, -pad_length:] = True
+ else:
+ padding_mask, _ = pad_to_multiple(
+ padding_mask,
+ self.required_seq_len_multiple,
+ dim=-1,
+ value=True)
+ x = F.dropout(x, p=self.dropout, training=self.training)
+
+ # B x T x C -> T x B x C
+ x = x.transpose([1, 0, 2])
+
+ layer_results = []
+ r = None
+ for i, layer in enumerate(self.layers):
+ dropout_probability = np.random.random() if self.layerdrop > 0 else 1
+ if not self.training or (dropout_probability > self.layerdrop):
+ x, (z, lr) = layer(
+ x, self_attn_padding_mask=padding_mask, need_weights=False)
+ if i >= min_layer:
+ layer_results.append((x, z, lr))
+ if i == tgt_layer:
+ r = x
+ break
+
+ if r is not None:
+ x = r
+
+ # T x B x C -> B x T x C
+ x = x.transpose([1, 0, 2])
+
+        # undo padding
+ if pad_length > 0:
+ x = x[:, :-pad_length]
+
+ def undo_pad(a, b, c):
+ return (a[:-pad_length], b[:-pad_length]
+ if b is not None else b, c[:-pad_length], )
+
+ layer_results = [undo_pad(*u) for u in layer_results]
+
+ return x, layer_results
+
+ def max_positions(self):
+ """Maximum output length supported by the encoder."""
+ return self.args.max_positions
+
+ def upgrade_state_dict_named(self, state_dict, name):
+ """Upgrade a (possibly old) state dict for new versions of fairseq."""
+ return state_dict
+
+
+class TransformerSentenceEncoderLayer(nn.Layer):
+ """
+ Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained
+ models.
+ """
+
+ def __init__(
+ self,
+ embedding_dim: float=768,
+ ffn_embedding_dim: float=3072,
+ num_attention_heads: int=8,
+ dropout: float=0.1,
+ attention_dropout: float=0.1,
+ activation_dropout: float=0.1,
+ activation_fn: str="relu",
+ layer_norm_first: bool=False, ) -> None:
+
+ super().__init__()
+ # Initialize parameters
+ self.embedding_dim = embedding_dim
+ self.dropout = dropout
+ self.activation_dropout = activation_dropout
+
+ # Initialize blocks
+ self.activation_fn = get_activation_fn(activation_fn)
+ self.self_attn = MultiheadAttention(
+ self.embedding_dim,
+ num_attention_heads,
+ dropout=attention_dropout,
+ self_attention=True, )
+
+ self.dropout1 = nn.Dropout(dropout)
+ self.dropout2 = nn.Dropout(self.activation_dropout)
+ self.dropout3 = nn.Dropout(dropout)
+
+ self.layer_norm_first = layer_norm_first
+
+ # layer norm associated with the self attention layer
+ self.self_attn_layer_norm = LayerNorm(self.embedding_dim)
+ self.fc1 = Linear(self.embedding_dim, ffn_embedding_dim)
+ self.fc2 = Linear(ffn_embedding_dim, self.embedding_dim)
+
+ # layer norm associated with the position wise feed-forward NN
+ self.final_layer_norm = LayerNorm(self.embedding_dim)
+
+ def forward(
+ self,
+ x: paddle.Tensor,
+ self_attn_mask: paddle.Tensor=None,
+ self_attn_padding_mask: paddle.Tensor=None,
+ need_weights: bool=False,
+ att_args=None, ):
+ """
+ LayerNorm is applied either before or after the self-attention/ffn
+        modules similar to the original Transformer implementation.
+ """
+ residual = x
+
+ if self.layer_norm_first:
+ x = self.self_attn_layer_norm(x)
+ x, attn = self.self_attn(
+ query=x,
+ key=x,
+ value=x,
+ key_padding_mask=self_attn_padding_mask,
+ attn_mask=self_attn_mask,
+ need_weights=False, )
+ x = self.dropout1(x)
+ x = residual + x
+
+ residual = x
+ x = self.final_layer_norm(x)
+ x = self.activation_fn(self.fc1(x))
+ x = self.dropout2(x)
+ x = self.fc2(x)
+
+ layer_result = x
+
+ x = self.dropout3(x)
+ x = residual + x
+ else:
+ x, attn = self.self_attn(
+ query=x,
+ key=x,
+ value=x,
+ key_padding_mask=self_attn_padding_mask,
+ need_weights=False, )
+
+ x = self.dropout1(x)
+ x = residual + x
+
+ x = self.self_attn_layer_norm(x)
+
+ residual = x
+ x = self.activation_fn(self.fc1(x))
+ x = self.dropout2(x)
+ x = self.fc2(x)
+
+ layer_result = x
+
+ x = self.dropout3(x)
+ x = residual + x
+ x = self.final_layer_norm(x)
+
+ return x, (attn, layer_result)
+
+
+@dataclass
+class AudioPretrainingConfig:
+ sample_rate: int = field(
+ default=16_000,
+ metadata={
+ "help":
+ "target sample rate. audio files will be up/down sampled to this rate"
+ }, )
+ normalize: bool = field(
+ default=False,
+ metadata={
+ "help": "if set, normalizes input to have 0 mean and unit variance"
+ }, )
+ enable_padding: bool = field(
+ default=False,
+ metadata={"help": "pad shorter samples instead of cropping"})
+ max_sample_size: Optional[int] = field(
+ default=None,
+ metadata={"help": "max sample size to crop to for batching"})
+ min_sample_size: Optional[int] = field(
+ default=None,
+ metadata={"help": "min sample size to skip small examples"})
diff --git a/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py b/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py
index 5482ed56..50a95f0b 100644
--- a/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py
+++ b/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py
@@ -354,7 +354,8 @@ class Resample(nn.Layer):
window_width = self.lowpass_filter_width / (2.0 * lowpass_cutoff)
assert lowpass_cutoff < min(self.orig_freq, self.new_freq) / 2
- output_t = paddle.arange(start=0.0, end=self.output_samples)
+ output_t = paddle.arange(
+ start=0.0, end=self.output_samples, dtype='int64')
output_t /= self.new_freq
min_t = output_t - window_width
max_t = output_t + window_width
diff --git a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
index 7468fdce..64195def 100755
--- a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
+++ b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
@@ -27,8 +27,11 @@ from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import Spec
from paddlespeech.s2t.modules.ctc import CTCDecoderBase as CTC
from paddlespeech.s2t.modules.initializer import DefaultInitializerContext
from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank
+from paddlespeech.s2t.utils.log import Log
from paddlespeech.s2t.utils.utility import log_add
+logger = Log(__name__).getlog()
+
class Wav2vec2ASR(nn.Layer):
def __init__(self, config: dict):
@@ -185,7 +188,7 @@ class Wav2vec2ASR(nn.Layer):
x_lens = x.shape[1]
ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size)
topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1)
- topk_index = topk_index.view(batch_size, x_lens) # (B, maxlen)
+ topk_index = topk_index.reshape([batch_size, x_lens]) # (B, maxlen)
hyps = [hyp.tolist() for hyp in topk_index]
hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]
diff --git a/paddlespeech/s2t/models/wavlm/__init__.py b/paddlespeech/s2t/models/wavlm/__init__.py
new file mode 100644
index 00000000..cf69114e
--- /dev/null
+++ b/paddlespeech/s2t/models/wavlm/__init__.py
@@ -0,0 +1,2 @@
+from .wavlm_paddle import WavLM, WavLMConfig
+from .wavlm_asr import WavLMASR, WavLMBase
\ No newline at end of file
diff --git a/paddlespeech/s2t/models/wavlm/modules/__init__.py b/paddlespeech/s2t/models/wavlm/modules/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/paddlespeech/s2t/models/wavlm/modules/activations.py b/paddlespeech/s2t/models/wavlm/modules/activations.py
new file mode 100644
index 00000000..b11dc1a9
--- /dev/null
+++ b/paddlespeech/s2t/models/wavlm/modules/activations.py
@@ -0,0 +1,88 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import paddle
+import paddle.nn.functional as F
+
+
+def _gelu_python(x):
+ """
+ Original Implementation of the GELU activation function in Google BERT repo when initially created. For
+ information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 +
+ torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in
+ torch.nn.functional Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
+ """
+ return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))
+
+
+
+
+def gelu_new(x):
+ """
+ Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see
+ the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415
+ """
+ return 0.5 * x * (1.0 + paddle.tanh(
+ math.sqrt(2.0 / math.pi) * (x + 0.044715 * paddle.pow(x, 3.0))))
+
+
+def gelu_fast(x):
+ return 0.5 * x * (1.0 + paddle.tanh(x * 0.7978845608 *
+ (1.0 + 0.044715 * x * x)))
+
+gelu = gelu_fast
+
+def _silu_python(x):
+ """
+ See Gaussian Error Linear Units (Hendrycks et al., https://arxiv.org/abs/1606.08415) where the SiLU (Sigmoid Linear
+ Unit) was originally introduced and coined, and see Sigmoid-Weighted Linear Units for Neural Network Function
+ Approximation in Reinforcement Learning (Elfwing et al., https://arxiv.org/abs/1702.03118) and Swish: a Self-Gated
+ Activation Function (Ramachandran et al., https://arxiv.org/abs/1710.05941v1) where the SiLU was experimented with
+ later.
+ """
+ return x * paddle.nn.functional.sigmoid(x)
+
+
+def mish(x):
+ return x * paddle.tanh(paddle.nn.functional.softplus(x))
+
+
+def linear_act(x):
+ return x
+
+
+ACT2FN = {
+ "relu": F.relu,
+ "silu": _silu_python,
+ "swish": _silu_python,
+ "gelu": gelu,
+ "tanh": paddle.tanh,
+ "gelu_new": gelu_new,
+ "gelu_fast": gelu_fast,
+ "mish": mish,
+ "linear": linear_act,
+ "sigmoid": paddle.nn.functional.sigmoid,
+}
+
+
+def get_activation(activation_string):
+ if activation_string in ACT2FN:
+ return ACT2FN[activation_string]
+ else:
+ raise KeyError(
+ f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}"
+ )
\ No newline at end of file
diff --git a/paddlespeech/s2t/models/wavlm/modules/functional.py b/paddlespeech/s2t/models/wavlm/modules/functional.py
new file mode 100644
index 00000000..d2ebdc71
--- /dev/null
+++ b/paddlespeech/s2t/models/wavlm/modules/functional.py
@@ -0,0 +1,473 @@
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from typing import Optional, List, Tuple
+import math
+
+def _mha_shape_check(query: paddle.Tensor, key: paddle.Tensor, value: paddle.Tensor,
+ key_padding_mask: Optional[paddle.Tensor], attn_mask: Optional[paddle.Tensor], num_heads: int):
+    # Verifies the expected shape for `query`, `key`, `value`, `key_padding_mask` and `attn_mask`
+ # and returns if the input is batched or not.
+ # Raises an error if `query` is not 2-D (unbatched) or 3-D (batched) tensor.
+
+ # Shape check.
+ if query.dim() == 3:
+ # Batched Inputs
+ is_batched = True
+ assert key.dim() == 3 and value.dim() == 3, \
+ ("For batched (3-D) `query`, expected `key` and `value` to be 3-D"
+ f" but found {key.dim()}-D and {value.dim()}-D tensors respectively")
+ if key_padding_mask is not None:
+ assert key_padding_mask.dim() == 2, \
+ ("For batched (3-D) `query`, expected `key_padding_mask` to be `None` or 2-D"
+ f" but found {key_padding_mask.dim()}-D tensor instead")
+ if attn_mask is not None:
+ assert attn_mask.dim() in (2, 3), \
+ ("For batched (3-D) `query`, expected `attn_mask` to be `None`, 2-D or 3-D"
+ f" but found {attn_mask.dim()}-D tensor instead")
+ elif query.dim() == 2:
+ # Unbatched Inputs
+ is_batched = False
+ assert key.dim() == 2 and value.dim() == 2, \
+ ("For unbatched (2-D) `query`, expected `key` and `value` to be 2-D"
+ f" but found {key.dim()}-D and {value.dim()}-D tensors respectively")
+
+ if key_padding_mask is not None:
+ assert key_padding_mask.dim() == 1, \
+ ("For unbatched (2-D) `query`, expected `key_padding_mask` to be `None` or 1-D"
+ f" but found {key_padding_mask.dim()}-D tensor instead")
+
+ if attn_mask is not None:
+ assert attn_mask.dim() in (2, 3), \
+ ("For unbatched (2-D) `query`, expected `attn_mask` to be `None`, 2-D or 3-D"
+ f" but found {attn_mask.dim()}-D tensor instead")
+ if attn_mask.dim() == 3:
+ expected_shape = (num_heads, query.shape[0], key.shape[0])
+ assert attn_mask.shape == expected_shape, \
+ (f"Expected `attn_mask` shape to be {expected_shape} but got {attn_mask.shape}")
+ else:
+ raise AssertionError(
+ f"query should be unbatched 2D or batched 3D tensor but received {query.dim()}-D query tensor")
+
+
+def scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal):
+ """
+ Scaled Dot-Product Attention
+ """
+
+ d_key = k.shape[-1]
+ scaled_q = paddle.scale(x=q, scale=d_key ** -0.5)
+ product = paddle.matmul(x=scaled_q, y=k, transpose_y=True)
+ weights = F.softmax(x=product + attn_mask)
+ if dropout_p:
+ weights = F.dropout(
+ weights,
+ p=dropout_p,
+ training=True,
+ mode="upscale_in_train"
+ )
+ out = paddle.matmul(x=weights, y=v)
+ return out
+
+
+def addr(input, vec1, vec2, beta=1, alpha=1, out=None):
+ """
+ A helper function to calculate alpha*(vec1*vec2^T) + beta*input
+ """
+ row = vec1.shape[0]
+ column = vec2.shape[0]
+ vec1 = paddle.unsqueeze(vec1, 0)
+ vec1 = paddle.transpose(vec1, [1, 0])
+ vec1 = paddle.expand(vec1, [row, column])
+ new_vec2 = paddle.zeros([column, column], dtype=vec2.dtype)
+ new_vec2[0, :] = vec2
+ out = alpha * paddle.matmul(vec1, new_vec2)
+ out = beta * input + out
+ return out
+
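+# Equivalent form (a sketch): addr(I, u, v, beta, alpha) computes
+#   beta * I + alpha * (u.unsqueeze(1) * v.unsqueeze(0))
+# i.e. a scaled outer product added to the input; the zero-padded matmul above
+# is just one way of materialising u v^T.
+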
+def multi_head_attention_forward(
+ x: paddle.Tensor,
+ num_heads: int,
+ q_proj: nn.Linear,
+ k_proj: nn.Linear,
+ v_proj: nn.Linear,
+ c_proj: nn.Linear,
+ attn_mask: Optional[paddle.Tensor] = None,
+):
+ max_len, batch_size, emb_dim = x.shape
+ head_dim = emb_dim // num_heads
+ scaling = float(head_dim) ** -0.5
+ q = q_proj(x) # L, N, E
+ k = k_proj(x) # L, N, E
+ v = v_proj(x) # L, N, E
+
+ v = v.reshape((-1, batch_size * num_heads, head_dim)).transpose((1, 0, 2))
+ k = k.reshape((-1, batch_size * num_heads, head_dim)).transpose((1, 0, 2))
+ q = q.reshape((-1, batch_size * num_heads, head_dim)).transpose((1, 0, 2))
+
+ q = q * scaling
+ qk = paddle.matmul(q, k, transpose_y=True)
+ if attn_mask is not None:
+ if attn_mask.ndim == 2:
+ attn_mask.unsqueeze_(0)
+ assert attn_mask.shape[0] == 1 and attn_mask.shape[1] == max_len and attn_mask.shape[2] == max_len
+ qk += attn_mask
+
+ qk = F.softmax(qk, axis=-1)
+ atten = paddle.bmm(qk, v)
+ atten = atten.transpose((1, 0, 2))
+ atten = atten.reshape((max_len, batch_size, emb_dim))
+ atten = c_proj(atten)
+ return atten
+
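+# Shape sketch for the helper above: x is (L, N, E), sequence-first; with
+# num_heads dividing E, the attention output keeps the same (L, N, E) shape
+# after the final c_proj projection.
+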
+def linear(input, weight, bias=None):
+ # compute y = x A^T + b
+ # Input: (N, in_feature) paddle tensor
+ # weight: (out_feature, in_feature) paddle tensor
+ # bias: (out_feature) paddle tensor
+ if input.dim() == 2 and bias is not None:
+ # fused op is marginally faster
+ return paddle.addmm(bias, input, weight)
+ output = paddle.matmul(input, weight)
+ if bias is not None:
+ output += bias
+ return output
+
+
+def _in_projection_packed(
+ q: paddle.Tensor,
+ k: paddle.Tensor,
+ v: paddle.Tensor,
+ w: paddle.Tensor,
+ b: Optional[paddle.Tensor] = None,
+) -> List[paddle.Tensor]:
+ r"""
+ Performs the in-projection step of the attention operation, using packed weights.
+ Output is a triple containing projection tensors for query, key and value.
+ Args:
+ q, k, v: query, key and value tensors to be projected. For self-attention,
+ these are typically the same tensor; for encoder-decoder attention,
+ k and v are typically the same tensor. (We take advantage of these
+ identities for performance if they are present.) Regardless, q, k and v
+ must share a common embedding dimension; otherwise their shapes may vary.
+ w: projection weights for q, k and v, packed into a single tensor. Weights
+ are packed along dimension 0, in q, k, v order.
+ b: optional projection biases for q, k and v, packed into a single tensor
+ in q, k, v order.
+ Shape:
+ Inputs:
+ - q: :math:`(..., E)` where E is the embedding dimension
+ - k: :math:`(..., E)` where E is the embedding dimension
+ - v: :math:`(..., E)` where E is the embedding dimension
+ - w: :math:`(E * 3, E)` where E is the embedding dimension
+ - b: :math:`E * 3` where E is the embedding dimension
+ Output:
+ - in output list :math:`[q', k', v']`, each output tensor will have the
+ same shape as the corresponding input tensor.
+ """
+ E = q.shape[-1]
+ if k is v:
+ if q is k:
+ # self-attention
+ proj = F.linear(q, w, b)
+ # reshape to 3, E and not E, 3 is deliberate for better memory coalescing and keeping same order as chunk()
+ proj = proj.unflatten(-1, (3, E)).unsqueeze(0).transpose([2, 1, 0]).squeeze(-2).contiguous()
+ return proj[0], proj[1], proj[2]
+ else:
+ # encoder-decoder attention
+ w_q, w_kv = w.split([E, E * 2])
+ if b is None:
+ b_q = b_kv = None
+ else:
+ b_q, b_kv = b.split([E, E * 2])
+ q_proj = F.linear(q, w_q, b_q)
+ kv_proj = F.linear(k, w_kv, b_kv)
+ # reshape to 2, E and not E, 2 is deliberate for better memory coalescing and keeping same order as chunk()
+ kv_proj = kv_proj.unflatten(-1, (2, E)).unsqueeze(0).transpose([2, 1, 0]).squeeze(-2).contiguous()
+ return (q_proj, kv_proj[0], kv_proj[1])
+ else:
+ w_q, w_k, w_v = w.chunk(3)
+ if b is None:
+ b_q = b_k = b_v = None
+ else:
+ b_q, b_k, b_v = b.chunk(3)
+ return F.linear(q, w_q, b_q), F.linear(k, w_k, b_k), F.linear(v, w_v, b_v)
+
+def _in_projection(
+ q: paddle.Tensor,
+ k: paddle.Tensor,
+ v: paddle.Tensor,
+ w_q: paddle.Tensor,
+ w_k: paddle.Tensor,
+ w_v: paddle.Tensor,
+ b_q: Optional[paddle.Tensor] = None,
+ b_k: Optional[paddle.Tensor] = None,
+ b_v: Optional[paddle.Tensor] = None,
+) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+ A, B, C = F.linear(q, w_q, b_q), F.linear(k, w_k, b_k), F.linear(v, w_v, b_v)
+ return A, B, C
+
+def multi_head_attention_forward_paddle(
+ query: paddle.Tensor,
+ key: paddle.Tensor,
+ value: paddle.Tensor,
+ embed_dim_to_check: int,
+ num_heads: int,
+ in_proj_weight: Optional[paddle.Tensor],
+ in_proj_bias: Optional[paddle.Tensor],
+ bias_k: Optional[paddle.Tensor],
+ bias_v: Optional[paddle.Tensor],
+ add_zero_attn: bool,
+ dropout_p: float,
+ out_proj_weight: paddle.Tensor,
+ out_proj_bias: Optional[paddle.Tensor],
+ training: bool = True,
+ key_padding_mask: Optional[paddle.Tensor] = None,
+ need_weights: bool = True,
+ attn_mask: Optional[paddle.Tensor] = None,
+ use_separate_proj_weight: bool = False,
+ q_proj_weight: Optional[paddle.Tensor] = None,
+ k_proj_weight: Optional[paddle.Tensor] = None,
+ v_proj_weight: Optional[paddle.Tensor] = None,
+ static_k: Optional[paddle.Tensor] = None,
+ static_v: Optional[paddle.Tensor] = None,
+ average_attn_weights: bool = True,
+ is_causal: bool = False,
+) -> Tuple[paddle.Tensor, Optional[paddle.Tensor]]:
+ r"""
+ Args:
+ query, key, value: map a query and a set of key-value pairs to an output.
+ See "Attention Is All You Need" for more details.
+ embed_dim_to_check: total dimension of the model.
+ num_heads: parallel attention heads.
+ in_proj_weight, in_proj_bias: input projection weight and bias.
+ bias_k, bias_v: bias of the key and value sequences to be added at dim=0.
+ add_zero_attn: add a new batch of zeros to the key and
+ value sequences at dim=1.
+ dropout_p: probability of an element to be zeroed.
+ out_proj_weight, out_proj_bias: the output projection weight and bias.
+ training: apply dropout if is ``True``.
+ key_padding_mask: if provided, specified padding elements in the key will
+            be ignored by the attention. This is a binary mask. When the value is True,
+ the corresponding value on the attention layer will be filled with -inf.
+ need_weights: output attn_output_weights.
+ attn_mask: 2D or 3D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all
+ the batches while a 3D mask allows to specify a different mask for the entries of each batch.
+ is_causal: If specified, applies a causal mask as attention mask, and ignores
+ attn_mask for computing scaled dot product attention.
+ Default: ``False``.
+        use_separate_proj_weight: the function accepts the proj. weights for query, key,
+ and value in different forms. If false, in_proj_weight will be used, which is
+ a combination of q_proj_weight, k_proj_weight, v_proj_weight.
+ q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias.
+ static_k, static_v: static key and value used for attention operators.
+ average_attn_weights: If true, indicates that the returned ``attn_weights`` should be averaged across heads.
+ Otherwise, ``attn_weights`` are provided separately per head. Note that this flag only has an effect
+ when ``need_weights=True.``. Default: True
+ Shape:
+ Inputs:
+ - query: :math:`(L, E)` or :math:`(L, N, E)` where L is the target sequence length, N is the batch size, E is
+ the embedding dimension.
+ - key: :math:`(S, E)` or :math:`(S, N, E)`, where S is the source sequence length, N is the batch size, E is
+ the embedding dimension.
+ - value: :math:`(S, E)` or :math:`(S, N, E)` where S is the source sequence length, N is the batch size, E is
+ the embedding dimension.
+ - key_padding_mask: :math:`(S)` or :math:`(N, S)` where N is the batch size, S is the source sequence length.
+ If a FloatTensor is provided, it will be directly added to the value.
+ If a BoolTensor is provided, the positions with the
+ value of ``True`` will be ignored while the position with the value of ``False`` will be unchanged.
+ - attn_mask: 2D mask :math:`(L, S)` where L is the target sequence length, S is the source sequence length.
+ 3D mask :math:`(N*num_heads, L, S)` where N is the batch size, L is the target sequence length,
+ S is the source sequence length. attn_mask ensures that position i is allowed to attend the unmasked
+ positions. If a BoolTensor is provided, positions with ``True``
+ are not allowed to attend while ``False`` values will be unchanged. If a FloatTensor
+ is provided, it will be added to the attention weight.
+ - static_k: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
+ N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
+ - static_v: :math:`(N*num_heads, S, E/num_heads)`, where S is the source sequence length,
+ N is the batch size, E is the embedding dimension. E/num_heads is the head dimension.
+ Outputs:
+ - attn_output: :math:`(L, E)` or :math:`(L, N, E)` where L is the target sequence length, N is the batch size,
+ E is the embedding dimension.
+ - attn_output_weights: Only returned when ``need_weights=True``. If ``average_attn_weights=True``, returns
+ attention weights averaged across heads of shape :math:`(L, S)` when input is unbatched or
+ :math:`(N, L, S)`, where :math:`N` is the batch size, :math:`L` is the target sequence length, and
+ :math:`S` is the source sequence length. If ``average_attn_weights=False``, returns attention weights per
+ head of shape :math:`(num_heads, L, S)` when input is unbatched or :math:`(N, num_heads, L, S)`.
+ """
+
+ is_batched = _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads)
+ tgt_len, bsz, embed_dim = query.shape
+ src_len, _, _ = key.shape
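+    # note: query/key/value are time-major here, i.e. (seq_len, batch, embed_dim)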
+
+ if is_causal:
+ attn_mask = None
+
+ assert embed_dim == embed_dim_to_check, \
+ f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
+ if isinstance(embed_dim, paddle.Tensor):
+ # embed_dim can be a tensor when JIT tracing
+        head_dim = embed_dim // num_heads
+ else:
+ head_dim = embed_dim // num_heads
+ assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
+ if use_separate_proj_weight:
+ # allow MHA to have different embedding dimensions when separate projection weights are used
+ assert key.shape[:2] == value.shape[:2], \
+ f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
+ else:
+ assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}"
+
+ #
+ # compute in-projection
+ #
+ if not use_separate_proj_weight:
+ assert in_proj_weight is not None, "use_separate_proj_weight is False but in_proj_weight is None"
+ q, k, v = _in_projection_packed(query, key, value, in_proj_weight, in_proj_bias)
+
+ else:
+ assert q_proj_weight is not None, "use_separate_proj_weight is True but q_proj_weight is None"
+ assert k_proj_weight is not None, "use_separate_proj_weight is True but k_proj_weight is None"
+ assert v_proj_weight is not None, "use_separate_proj_weight is True but v_proj_weight is None"
+ if in_proj_bias is None:
+ b_q = b_k = b_v = None
+ else:
+ b_q, b_k, b_v = in_proj_bias.chunk(3)
+
+ q, k, v = _in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q, b_k, b_v)
+
+ # prep attention mask
+
+ if attn_mask is not None:
+ # ensure attn_mask's dim is 3
+ if attn_mask.dim() == 2:
+ correct_2d_size = (tgt_len, src_len)
+ if attn_mask.shape != correct_2d_size:
+ raise RuntimeError(f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}.")
+ attn_mask = attn_mask.unsqueeze(0)
+ elif attn_mask.dim() == 3:
+ correct_3d_size = (bsz * num_heads, tgt_len, src_len)
+ if tuple(attn_mask.shape) != correct_3d_size:
+ raise RuntimeError(f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}.")
+ else:
+ raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported")
+
+ # add bias along batch dimension (currently second)
+ if bias_k is not None and bias_v is not None:
+ assert static_k is None, "bias cannot be added to static key."
+ assert static_v is None, "bias cannot be added to static value."
+        k = paddle.concat([k, bias_k.tile([1, bsz, 1])], axis=0)
+        v = paddle.concat([v, bias_v.tile([1, bsz, 1])], axis=0)
+ if attn_mask is not None:
+ # attn_mask = pad(attn_mask, (0, 1))
+ # pad last dim with 0 on one side and 1 on the other
+ attn_mask = paddle.concat([attn_mask, paddle.zeros_like(attn_mask[:, :, -1:])], axis=2)
+ if key_padding_mask is not None:
+ # key_padding_mask = pad(key_padding_mask, (0, 1))
+ # pad last dim with 0 on one side and 1 on the other
+ key_padding_mask = paddle.concat([key_padding_mask, paddle.zeros_like(key_padding_mask[:, -1:])], axis=1)
+ else:
+ assert bias_k is None
+ assert bias_v is None
+
+ #
+ # reshape q, k, v for multihead attention and make em batch first
+ #
+ q = q.reshape([tgt_len, bsz * num_heads, head_dim]).transpose([1, 0, 2])
+
+
+ if static_k is None:
+ k = k.reshape([k.shape[0], bsz * num_heads, head_dim]).transpose([1, 0, 2])
+ else:
+        assert static_k.shape[0] == bsz * num_heads, \
+            f"expecting static_k.shape[0] of {bsz * num_heads}, but got {static_k.shape[0]}"
+        assert static_k.shape[2] == head_dim, \
+            f"expecting static_k.shape[2] of {head_dim}, but got {static_k.shape[2]}"
+ k = static_k
+ if static_v is None:
+ v = v.reshape([v.shape[0], bsz * num_heads, head_dim]).transpose([1, 0, 2])
+ else:
+ # TODO finish disentangling control flow so we don't do in-projections when statics are passed
+        assert static_v.shape[0] == bsz * num_heads, \
+            f"expecting static_v.shape[0] of {bsz * num_heads}, but got {static_v.shape[0]}"
+        assert static_v.shape[2] == head_dim, \
+            f"expecting static_v.shape[2] of {head_dim}, but got {static_v.shape[2]}"
+ v = static_v
+
+ # add zero attention along batch dimension (now first)
+ if add_zero_attn:
+ zero_attn_shape = (bsz * num_heads, 1, head_dim)
+        k = paddle.concat([k, paddle.zeros(zero_attn_shape, dtype=k.dtype)], axis=1)
+        v = paddle.concat([v, paddle.zeros(zero_attn_shape, dtype=v.dtype)], axis=1)
+ if attn_mask is not None:
+ # attn_mask = pad(attn_mask, (0, 1))
+ attn_mask = paddle.concat([attn_mask, paddle.zeros_like(attn_mask[:, :, -1:])], axis=2)
+ if key_padding_mask is not None:
+ # key_padding_mask = pad(key_padding_mask, (0, 1))
+ key_padding_mask = paddle.concat([key_padding_mask, paddle.zeros_like(key_padding_mask[:, -1:])], axis=1)
+
+ # update source sequence length after adjustments
+ src_len = k.shape[1]
+
+ # merge key padding and attention masks
+ if key_padding_mask is not None:
+ assert key_padding_mask.shape == (bsz, src_len), \
+ f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}"
+ key_padding_mask = key_padding_mask.reshape([bsz, 1, 1, src_len]).expand([-1, num_heads, -1, -1]).reshape([bsz * num_heads, 1, src_len])
+ if attn_mask is None:
+ attn_mask = key_padding_mask
+ else:
+ attn_mask = attn_mask + key_padding_mask
+
+ # adjust dropout probability
+ if not training:
+ dropout_p = 0.0
+
+ #
+ # (deep breath) calculate attention and out projection
+ #
+ if need_weights:
+ B, Nt, E = q.shape
+ q_scaled = q / math.sqrt(E)
+        if attn_mask is not None:
+            # equivalent of torch.baddbmm: add the (broadcastable) mask to the scaled scores
+            attn_output_weights = attn_mask + paddle.bmm(q_scaled, k.transpose([0, 2, 1]))
+        else:
+            attn_output_weights = paddle.bmm(q_scaled, k.transpose([0, 2, 1]))
+ attn_output_weights = F.softmax(attn_output_weights, axis=-1)
+ if dropout_p > 0.0:
+ attn_output_weights = F.dropout(attn_output_weights, p=dropout_p)
+
+ attn_output = paddle.bmm(attn_output_weights, v)
+ attn_output = attn_output.transpose([1, 0, 2]).reshape([tgt_len * bsz, embed_dim])
+ # attn_output = linear(attn_output, out_proj_weight, out_proj_bias)
+ attn_output = F.linear(attn_output, out_proj_weight, out_proj_bias)
+ attn_output = attn_output.reshape([tgt_len, bsz, attn_output.shape[1]])
+
+ # optionally average attention weights over heads
+ attn_output_weights = attn_output_weights.reshape([bsz, num_heads, tgt_len, src_len])
+ if average_attn_weights:
+            attn_output_weights = attn_output_weights.mean(axis=1)
+
+ if not is_batched:
+ # squeeze the output if input was unbatched
+ attn_output = attn_output.squeeze(1)
+ attn_output_weights = attn_output_weights.squeeze(0)
+ return attn_output, attn_output_weights
+ else:
+ # attn_mask can be either (L,S) or (N*num_heads, L, S)
+ # if attn_mask's shape is (1, L, S) we need to unsqueeze to (1, 1, L, S)
+ # in order to match the input for SDPA of (N, num_heads, L, S)
+ if attn_mask is not None:
+ if attn_mask.shape[0] == 1 and attn_mask.dim() == 3:
+ attn_mask = attn_mask.unsqueeze(0)
+ else:
+ attn_mask = attn_mask.reshape([bsz, num_heads, -1, src_len])
+
+ q = q.reshape([bsz, num_heads, tgt_len, head_dim])
+ k = k.reshape([bsz, num_heads, src_len, head_dim])
+ v = v.reshape([bsz, num_heads, src_len, head_dim])
+ attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
+ attn_output = attn_output.transpose(perm=[2, 0, 1, 3]).reshape([bsz * tgt_len, embed_dim])
+ attn_output = F.linear(attn_output, out_proj_weight, out_proj_bias)
+ attn_output = attn_output.reshape([tgt_len, bsz, attn_output.shape[1]])
+ return attn_output, None
\ No newline at end of file
diff --git a/paddlespeech/s2t/models/wavlm/modules/modules.py b/paddlespeech/s2t/models/wavlm/modules/modules.py
new file mode 100644
index 00000000..f14e4016
--- /dev/null
+++ b/paddlespeech/s2t/models/wavlm/modules/modules.py
@@ -0,0 +1,768 @@
+# --------------------------------------------------------
+# WavLM: Large-Scale Self-Supervised Pre-training for Full Stack Speech Processing (https://arxiv.org/abs/2110.13900.pdf)
+# Github source: https://github.com/microsoft/unilm/tree/master/wavlm
+# Copyright (c) 2021 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Based on fairseq code bases
+# https://github.com/pytorch/fairseq
+# --------------------------------------------------------
+
+import math
+import warnings
+from typing import Dict, Optional, Tuple
+from .functional import multi_head_attention_forward_paddle
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle import Tensor
+
+
+
+class TransposeLast(nn.Layer):
+ def __init__(self, deconstruct_idx=None):
+ super().__init__()
+ self.deconstruct_idx = deconstruct_idx
+
+ def forward(self, x):
+ if self.deconstruct_idx is not None:
+ x = x[self.deconstruct_idx]
+ return paddle.transpose(x, perm=[0, 2, 1])
+
+
+class Fp32LayerNorm(nn.LayerNorm):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def forward(self, input):
+        output = F.layer_norm(
+            input.astype("float32"),
+            self._normalized_shape,
+            self.weight.astype("float32") if self.weight is not None else None,
+            self.bias.astype("float32") if self.bias is not None else None,
+            self._epsilon,
+        )
+        return output.astype(input.dtype)
+
+
+class Fp32GroupNorm(nn.GroupNorm):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ def forward(self, input):
+        output = F.group_norm(
+            input.astype("float32"),
+            self._num_groups,
+            weight=self.weight.astype("float32") if self.weight is not None else None,
+            bias=self.bias.astype("float32") if self.bias is not None else None,
+            epsilon=self._epsilon,
+        )
+        return output.astype(input.dtype)
+
+
+
+class SamePad(nn.Layer):
+ def __init__(self, kernel_size, causal=False):
+ super().__init__()
+ if causal:
+ self.remove = kernel_size - 1
+ else:
+ self.remove = 1 if kernel_size % 2 == 0 else 0
+
+ def forward(self, x):
+ if self.remove > 0:
+ x = x[:, :, : -self.remove]
+ return x
+
+
+class Swish(nn.Layer):
+ """Swish function
+ """
+
+ def __init__(self):
+ """Construct an MultiHeadedAttention object."""
+ super(Swish, self).__init__()
+ self.act = nn.Sigmoid()
+
+ def forward(self, x):
+ return x * self.act(x)
+
+
+class GLU_Linear(nn.Layer):
+ def __init__(self, input_dim, output_dim, glu_type="sigmoid", bias_in_glu=True):
+ super(GLU_Linear, self).__init__()
+
+ self.glu_type = glu_type
+ self.output_dim = output_dim
+
+ if glu_type == "sigmoid":
+ self.glu_act = nn.Sigmoid()
+ elif glu_type == "swish":
+ self.glu_act = Swish()
+ elif glu_type == "relu":
+ self.glu_act = nn.ReLU()
+ elif glu_type == "gelu":
+ self.glu_act = nn.GELU()
+
+ if bias_in_glu:
+ self.linear = nn.Linear(input_dim, output_dim * 2, True)
+ else:
+ self.linear = nn.Linear(input_dim, output_dim * 2, False)
+
+ def forward(self, x):
+ # to be consistent with GLU_Linear, we assume the input always has the #channel (#dim) in the last dimension of the tensor, so need to switch the dimension first for 1D-Conv case
+ x = self.linear(x)
+
+ if self.glu_type == "bilinear":
+ x = (x[:, :, 0:self.output_dim] * x[:, :, self.output_dim:self.output_dim * 2])
+ else:
+ x = (x[:, :, 0:self.output_dim] * self.glu_act(x[:, :, self.output_dim:self.output_dim * 2]))
+
+ return x
+
+
+def gelu_accurate(x):
+ if not hasattr(gelu_accurate, "_a"):
+ gelu_accurate._a = math.sqrt(2 / math.pi)
+ return (
+ 0.5 * x * (1 + paddle.tanh(gelu_accurate._a * (x + 0.044715 * paddle.pow(x, 3))))
+ )
+
+
+def gelu(x: Tensor) -> Tensor:
+ return nn.functional.gelu(x.astype("float32")).astype(x.dtype)
+
+
+def get_activation_fn(activation: str):
+ """Returns the activation function corresponding to `activation`"""
+
+ if activation == "relu":
+ return F.relu
+ elif activation == "gelu":
+ return gelu
+ elif activation == "gelu_fast":
+ warnings.warn(
+ "--activation-fn=gelu_fast has been renamed to gelu_accurate"
+ )
+ return gelu_accurate
+ elif activation == "gelu_accurate":
+ return gelu_accurate
+ elif activation == "tanh":
+ return paddle.tanh
+ elif activation == "linear":
+ return lambda x: x
+ elif activation == "glu":
+ return lambda x: x
+ else:
+ raise RuntimeError("--activation-fn {} not supported".format(activation))
+
+
+def quant_noise(module, p, block_size):
+ """
+ Wraps modules and applies quantization noise to the weights for
+ subsequent quantization with Iterative Product Quantization as
+ described in "Training with Quantization Noise for Extreme Model Compression"
+
+ Args:
+ - module: nn.Layer
+ - p: amount of Quantization Noise
+ - block_size: size of the blocks for subsequent quantization with iPQ
+
+ Remarks:
+ - Module weights must have the right sizes wrt the block size
+ - Only Linear, Embedding and Conv2d modules are supported for the moment
+ - For more detail on how to quantize by blocks with convolutional weights,
+ see "And the Bit Goes Down: Revisiting the Quantization of Neural Networks"
+ - We implement the simplest form of noise here as stated in the paper
+ which consists in randomly dropping blocks
+ """
+
+ # if no quantization noise, don't register hook
+ if p <= 0:
+ return module
+
+ # supported modules
+ assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d))
+
+ # test whether module.weight has the right sizes wrt block_size
+ is_conv = module.weight.ndim == 4
+
+ # 2D matrix
+ if not is_conv:
+ assert (
+ module.weight.size(1) % block_size == 0
+ ), "Input features must be a multiple of block sizes"
+
+ # 4D matrix
+ else:
+ # 1x1 convolutions
+ if module.kernel_size == (1, 1):
+ assert (
+ module.in_channels % block_size == 0
+ ), "Input channels must be a multiple of block sizes"
+ # regular convolutions
+ else:
+ k = module.kernel_size[0] * module.kernel_size[1]
+ assert k % block_size == 0, "Kernel size must be a multiple of block size"
+
+ def _forward_pre_hook(mod, input):
+ # no noise for evaluation
+ if mod.training:
+ if not is_conv:
+ # gather weight and sizes
+ weight = mod.weight
+ in_features = weight.size(1)
+ out_features = weight.size(0)
+
+ # split weight matrix into blocks and randomly drop selected blocks
+ mask = paddle.zeros(
+ in_features // block_size * out_features, device=weight.device
+ )
+ mask.bernoulli_(p)
+ mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)
+
+ else:
+ # gather weight and sizes
+ weight = mod.weight
+ in_channels = mod.in_channels
+ out_channels = mod.out_channels
+
+ # split weight matrix into blocks and randomly drop selected blocks
+ if mod.kernel_size == (1, 1):
+ mask = paddle.zeros(
+ int(in_channels // block_size * out_channels),
+ device=weight.device,
+ )
+ mask.bernoulli_(p)
+ mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels)
+ else:
+ mask = paddle.zeros(
+ weight.size(0), weight.size(1), device=weight.device
+ )
+
+ mask.bernoulli_(p)
+ mask = (
+ mask.unsqueeze(2)
+ .unsqueeze(3)
+ .repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])
+ )
+
+ # scale weights and apply mask
+ mask = mask.to(
+ paddle.bool
+ )
+ s = 1 / (1 - p)
+ mod.weight.data = s * weight.masked_fill(mask, 0)
+
+ module.register_forward_pre_hook(_forward_pre_hook)
+ return module
+
+
+class MultiheadAttention(nn.Layer):
+ """Multi-headed attention.
+
+ See "Attention Is All You Need" for more details.
+ """
+
+ def __init__(
+ self,
+ embed_dim,
+ num_heads,
+ kdim=None,
+ vdim=None,
+ dropout=0.0,
+ bias=True,
+ add_bias_kv=False,
+ add_zero_attn=False,
+ self_attention=False,
+ encoder_decoder_attention=False,
+ q_noise=0.0,
+ qn_block_size=8,
+ has_relative_attention_bias=True,
+ num_buckets=32,
+ max_distance=128,
+ gru_rel_pos=True,
+ rescale_init=False,
+ ):
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.kdim = kdim if kdim is not None else embed_dim
+ self.vdim = vdim if vdim is not None else embed_dim
+ self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim
+
+ self.num_heads = num_heads
+ self.dropout_module = nn.Dropout(dropout)
+
+ self.has_relative_attention_bias = has_relative_attention_bias
+ self.num_buckets = num_buckets
+ self.max_distance = max_distance
+ if self.has_relative_attention_bias:
+ self.relative_attention_bias = nn.Embedding(num_buckets, num_heads)
+
+ self.head_dim = embed_dim // num_heads
+ self.q_head_dim = self.head_dim
+ self.k_head_dim = self.head_dim
+ assert (
+ self.head_dim * num_heads == self.embed_dim
+ ), "embed_dim must be divisible by num_heads"
+ self.scaling = self.head_dim ** -0.5
+
+ self.self_attention = self_attention
+ self.encoder_decoder_attention = encoder_decoder_attention
+
+ assert not self.self_attention or self.qkv_same_dim, (
+ "Self-attention requires query, key and " "value to be of the same size"
+ )
+
+ k_bias = True
+ if rescale_init:
+ k_bias = False
+
+ k_embed_dim = embed_dim
+ q_embed_dim = embed_dim
+
+ self.k_proj = quant_noise(
+ nn.Linear(self.kdim, k_embed_dim, bias_attr=k_bias), q_noise, qn_block_size
+ )
+ self.v_proj = quant_noise(
+ nn.Linear(self.vdim, embed_dim, bias_attr=bias), q_noise, qn_block_size
+ )
+ self.q_proj = quant_noise(
+ nn.Linear(embed_dim, q_embed_dim, bias_attr=bias), q_noise, qn_block_size
+ )
+
+ self.out_proj = quant_noise(
+ nn.Linear(embed_dim, embed_dim, bias_attr=bias), q_noise, qn_block_size
+ )
+
+ if add_bias_kv:
+ self.bias_k = self.create_parameter(
+ shape=[1, 1, embed_dim], dtype="float32"
+ )
+ self.bias_v = self.create_parameter(
+ shape=[1, 1, embed_dim], dtype="float32"
+ )
+
+ else:
+ self.bias_k = self.bias_v = None
+
+ self.add_zero_attn = add_zero_attn
+
+ self.gru_rel_pos = gru_rel_pos
+ if self.gru_rel_pos:
+ self.grep_linear = nn.Linear(self.q_head_dim, 8)
+ self.grep_a = self.create_parameter(
+ shape=[1, num_heads, 1, 1], dtype="float32"
+ )
+
+
+ self.reset_parameters()
+
+ def reset_parameters(self):
+ pass
+
+ def _relative_positions_bucket(self, relative_positions, bidirectional=True):
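+        # T5-style relative position bucketing: half of the buckets store exact small
+        # offsets, the remaining buckets cover larger offsets on a logarithmic scale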
+ num_buckets = self.num_buckets
+ max_distance = self.max_distance
+ relative_buckets = 0
+
+ if bidirectional:
+ num_buckets = num_buckets // 2
+ relative_buckets += (relative_positions > 0).astype("int64") * num_buckets
+ relative_positions = paddle.abs(relative_positions)
+ else:
+ relative_positions = -paddle.minimum(relative_positions, paddle.zeros_like(relative_positions))
+
+ max_exact = num_buckets // 2
+ is_small = relative_positions < max_exact
+
+ relative_postion_if_large = max_exact + (
+ paddle.log(relative_positions.astype("float32") / max_exact)
+ / math.log(max_distance / max_exact)
+ * (num_buckets - max_exact)
+ ).astype("int64")
+ relative_postion_if_large = paddle.minimum(
+ relative_postion_if_large, paddle.full_like(relative_postion_if_large, num_buckets - 1)
+ )
+
+ relative_buckets += paddle.where(is_small, relative_positions, relative_postion_if_large)
+ return relative_buckets
+
+ def compute_bias(self, query_length, key_length):
+ context_position = paddle.arange(query_length, dtype="int64")[:, None]
+ memory_position = paddle.arange(key_length, dtype="int64")[None, :]
+ relative_position = memory_position - context_position
+ relative_position_bucket = self._relative_positions_bucket(
+ relative_position,
+ bidirectional=True
+ )
+ # relative_position_bucket = relative_position_bucket.to(self.relative_attention_bias.weight.device)
+ values = self.relative_attention_bias(relative_position_bucket)
+ values = values.transpose([2, 0, 1])
+ return values
+
+ def forward(
+ self,
+ query,
+ key: Optional[Tensor],
+ value: Optional[Tensor],
+ key_padding_mask: Optional[Tensor] = None,
+ incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None,
+ need_weights: bool = True,
+ static_kv: bool = False,
+ attn_mask: Optional[Tensor] = None,
+ before_softmax: bool = False,
+ need_head_weights: bool = False,
+ position_bias: Optional[Tensor] = None
+ ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
+ """Input shape: Time x Batch x Channel
+
+ Args:
+ key_padding_mask (ByteTensor, optional): mask to exclude
+ keys that are pads, of shape `(batch, src_len)`, where
+ padding elements are indicated by 1s.
+ need_weights (bool, optional): return the attention weights,
+ averaged over heads (default: False).
+ attn_mask (ByteTensor, optional): typically used to
+ implement causal attention, where the mask prevents the
+ attention from looking forward in time (default: None).
+ before_softmax (bool, optional): return the raw attention
+ weights and values before the attention softmax.
+ need_head_weights (bool, optional): return the attention
+ weights for each head. Implies *need_weights*. Default:
+ return the average attention weights over all heads.
+ """
+ if need_head_weights:
+ need_weights = True
+
+ tgt_len, bsz, embed_dim = query.shape
+ src_len = tgt_len
+ assert embed_dim == self.embed_dim
+ assert list(query.shape) == [tgt_len, bsz, embed_dim]
+ if key is not None:
+ src_len, key_bsz, _ = key.shape
+
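+        # the relative position bias is computed once from bucketed distances and
+        # broadcast to every sequence in the batch (one bias map per head)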
+ if self.has_relative_attention_bias and position_bias is None:
+ position_bias = self.compute_bias(tgt_len, src_len)
+ position_bias_ = position_bias.unsqueeze(0)
+ position_bias = paddle.concat([position_bias_ for _ in range(bsz)], axis=0)
+ position_bias = position_bias.reshape([bsz * self.num_heads, tgt_len, src_len])
+ if (
+ incremental_state is None
+ and not static_kv
+ and self.q_head_dim == self.head_dim
+ ):
+ assert key is not None and value is not None
+ assert attn_mask is None
+
+ attn_mask_rel_pos = None
+ if position_bias is not None:
+ attn_mask_rel_pos = position_bias
+ if self.gru_rel_pos:
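+                    # gated relative position bias: a per-head gate computed from the
+                    # query rescales the bucketed position bias before it is applied
+                    # as the additive attention mask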
+ query_layer = query.transpose([1, 0, 2])
+ new_x_shape = query_layer.shape[:-1] + [self.num_heads, -1]
+ query_layer = query_layer.reshape(new_x_shape)
+ query_layer = query_layer.transpose([0, 2, 1, 3])
+ _B, _H, _L, __ = query_layer.shape
+
+ gate_a, gate_b = paddle.nn.functional.sigmoid(self.grep_linear(query_layer).reshape([_B, _H, _L, 2, 4]).sum(-1, keepdim=False)).chunk(2, axis=-1)
+
+ gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0
+ attn_mask_rel_pos = gate_a_1.reshape([bsz * self.num_heads, -1, 1]) * position_bias
+
+ attn_mask_rel_pos = attn_mask_rel_pos.reshape((-1, tgt_len, tgt_len))
+ k_proj_bias = self.k_proj.bias
+ if k_proj_bias is None:
+ k_proj_bias = paddle.zeros_like(self.q_proj.bias)
+
+
+ x, attn = multi_head_attention_forward_paddle(
+ query,
+ key,
+ value,
+ self.embed_dim,
+ self.num_heads,
+ paddle.empty([0]),
+ paddle.concat((self.q_proj.bias, self.k_proj.bias, self.v_proj.bias), axis=0),
+ self.bias_k,
+ self.bias_v,
+ self.add_zero_attn,
+ self.dropout_module.p,
+ self.out_proj.weight,
+ self.out_proj.bias,
+ self.training,
+ key_padding_mask,
+ need_weights,
+ attn_mask_rel_pos,
+ use_separate_proj_weight=True,
+ q_proj_weight=self.q_proj.weight,
+ k_proj_weight=self.k_proj.weight,
+ v_proj_weight=self.v_proj.weight,
+ )
+
+ return x, attn, position_bias
+
+ if incremental_state is not None:
+ saved_state = self._get_input_buffer(incremental_state)
+ if saved_state is not None and "prev_key" in saved_state:
+ # previous time steps are cached - no need to recompute
+ # key and value if they are static
+ if static_kv:
+ assert self.encoder_decoder_attention and not self.self_attention
+ key = value = None
+ else:
+ saved_state = None
+
+ if self.self_attention:
+ q = self.q_proj(query)
+ k = self.k_proj(query)
+ v = self.v_proj(query)
+ elif self.encoder_decoder_attention:
+ # encoder-decoder attention
+ q = self.q_proj(query)
+ if key is None:
+ assert value is None
+ k = v = None
+ else:
+ k = self.k_proj(key)
+ v = self.v_proj(key)
+
+ else:
+ assert key is not None and value is not None
+ q = self.q_proj(query)
+ k = self.k_proj(key)
+ v = self.v_proj(value)
+ q *= self.scaling
+
+ if self.bias_k is not None:
+ assert self.bias_v is not None
+            k = paddle.concat([k, self.bias_k.tile([1, bsz, 1])], axis=0)
+            v = paddle.concat([v, self.bias_v.tile([1, bsz, 1])], axis=0)
+ if attn_mask is not None:
+ attn_mask = paddle.concat(
+ [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], axis=1
+ )
+
+ if key_padding_mask is not None:
+ key_padding_mask = paddle.concat(
+ [
+ key_padding_mask,
+ key_padding_mask.new_zeros(key_padding_mask.size(0), 1),
+ ],
+ axis=1,
+ )
+
+ q = (
+ q.contiguous()
+ .view(tgt_len, bsz * self.num_heads, self.q_head_dim)
+ .transpose([1, 0, 2])
+ )
+ if k is not None:
+ k = (
+ k.contiguous()
+ .view(-1, bsz * self.num_heads, self.k_head_dim)
+ .transpose([1, 0, 2])
+ )
+ if v is not None:
+ v = (
+ v.contiguous()
+ .view(-1, bsz * self.num_heads, self.head_dim)
+ .transpose([1, 0, 2])
+ )
+
+ if saved_state is not None:
+ # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
+ if "prev_key" in saved_state:
+ _prev_key = saved_state["prev_key"]
+ assert _prev_key is not None
+ prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim)
+ if static_kv:
+ k = prev_key
+ else:
+ assert k is not None
+ k = paddle.concat([prev_key, k], axis=1)
+ src_len = k.size(1)
+ if "prev_value" in saved_state:
+ _prev_value = saved_state["prev_value"]
+ assert _prev_value is not None
+ prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim)
+ if static_kv:
+ v = prev_value
+ else:
+ assert v is not None
+ v = paddle.concat([prev_value, v], axis=1)
+ prev_key_padding_mask: Optional[Tensor] = None
+ if "prev_key_padding_mask" in saved_state:
+ prev_key_padding_mask = saved_state["prev_key_padding_mask"]
+ assert k is not None and v is not None
+ key_padding_mask = MultiheadAttention._append_prev_key_padding_mask(
+ key_padding_mask=key_padding_mask,
+ prev_key_padding_mask=prev_key_padding_mask,
+ batch_size=bsz,
+ src_len=k.size(1),
+ static_kv=static_kv,
+ )
+
+ saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim)
+ saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim)
+ saved_state["prev_key_padding_mask"] = key_padding_mask
+ # In this branch incremental_state is never None
+ assert incremental_state is not None
+ incremental_state = self._set_input_buffer(incremental_state, saved_state)
+ assert k is not None
+ assert k.size(1) == src_len
+
+ # This is part of a workaround to get around fork/join parallelism
+ # not supporting Optional types.
+ if key_padding_mask is not None and key_padding_mask.dim() == 0:
+ key_padding_mask = None
+
+ if key_padding_mask is not None:
+ assert key_padding_mask.size(0) == bsz
+ assert key_padding_mask.size(1) == src_len
+
+ if self.add_zero_attn:
+ assert v is not None
+ src_len += 1
+ k = paddle.concat([k, k.new_zeros((k.size(0), 1) + k.shape[2:])], axis=1)
+ v = paddle.concat([v, v.new_zeros((v.size(0), 1) + v.shape[2:])], axis=1)
+ if attn_mask is not None:
+ attn_mask = paddle.concat(
+ [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], axis=1
+ )
+
+ if key_padding_mask is not None:
+ key_padding_mask = paddle.concat(
+ [
+ key_padding_mask,
+ paddle.zeros(key_padding_mask.size(0), 1).type_as(
+ key_padding_mask
+ ),
+ ],
+ axis=1,
+ )
+
+
+ attn_weights = paddle.matmul(q, k.transpose([0, 2, 1]))
+
+ attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
+
+ assert list(attn_weights.shape) == [bsz * self.num_heads, tgt_len, src_len]
+
+ if attn_mask is not None:
+ attn_mask = attn_mask.unsqueeze(0)
+ attn_weights += attn_mask
+
+ if key_padding_mask is not None:
+ # don't attend to padding symbols
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+ attn_weights = attn_weights.masked_fill(
+ key_padding_mask.unsqueeze(1).unsqueeze(2).to(paddle.bool),
+ float("-inf"),
+ )
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+ if before_softmax:
+ return attn_weights, v, position_bias
+
+ if position_bias is not None:
+ if self.gru_rel_pos == 1:
+ query_layer = q.view(bsz, self.num_heads, tgt_len, self.q_head_dim)
+ _B, _H, _L, __ = query_layer.shape
+ gate_a, gate_b = paddle.sigmoid(self.grep_linear(query_layer).view(
+ _B, _H, _L, 2, 4).sum(-1, keepdim=False)).chunk(2, axis=-1)
+
+ gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0
+ position_bias = gate_a_1.view(bsz * self.num_heads, -1, 1) * position_bias
+
+ position_bias = position_bias.view(attn_weights.shape)
+
+ attn_weights = attn_weights + position_bias
+
+        attn_weights_float = F.softmax(
+            attn_weights, axis=-1
+        )
+ attn_weights = attn_weights_float.type_as(attn_weights)
+ attn_probs = self.dropout_module(attn_weights)
+
+ assert v is not None
+ attn = paddle.bmm(attn_probs, v)
+ assert list(attn.shape) == [bsz * self.num_heads, tgt_len, self.head_dim]
+ attn = attn.transpose([1, 0, 2]).reshape([tgt_len, bsz, embed_dim])
+ attn = self.out_proj(attn)
+ attn_weights: Optional[Tensor] = None
+ if need_weights:
+ attn_weights = attn_weights_float.view(
+ bsz, self.num_heads, tgt_len, src_len
+ ).transpose([1, 0, 2, 3])
+ if not need_head_weights:
+ # average attention weights over heads
+                attn_weights = attn_weights.mean(axis=0)
+
+ return attn, attn_weights, position_bias
+
+ @staticmethod
+ def _append_prev_key_padding_mask(
+ key_padding_mask: Optional[Tensor],
+ prev_key_padding_mask: Optional[Tensor],
+ batch_size: int,
+ src_len: int,
+ static_kv: bool,
+ ) -> Optional[Tensor]:
+ # saved key padding masks have shape (bsz, seq_len)
+ if prev_key_padding_mask is not None and static_kv:
+ new_key_padding_mask = prev_key_padding_mask
+ elif prev_key_padding_mask is not None and key_padding_mask is not None:
+ new_key_padding_mask = paddle.concat(
+ [prev_key_padding_mask.float(), key_padding_mask.float()], axis=1
+ )
+ # During incremental decoding, as the padding token enters and
+ # leaves the frame, there will be a time when prev or current
+ # is None
+ elif prev_key_padding_mask is not None:
+ if src_len > prev_key_padding_mask.size(1):
+ filler = paddle.zeros(
+ (batch_size, src_len - prev_key_padding_mask.size(1)),
+ device=prev_key_padding_mask.device,
+ )
+ new_key_padding_mask = paddle.concat(
+ [prev_key_padding_mask.float(), filler.float()], axis=1
+ )
+
+ else:
+ new_key_padding_mask = prev_key_padding_mask.float()
+ elif key_padding_mask is not None:
+ if src_len > key_padding_mask.size(1):
+ filler = paddle.zeros(
+ (batch_size, src_len - key_padding_mask.size(1)),
+ device=key_padding_mask.device,
+ )
+ new_key_padding_mask = paddle.concat(
+ [filler.float(), key_padding_mask.float()], axis=1
+ )
+
+ else:
+ new_key_padding_mask = key_padding_mask.float()
+ else:
+ new_key_padding_mask = prev_key_padding_mask
+ return new_key_padding_mask
+
+ def _get_input_buffer(
+ self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]]
+ ) -> Dict[str, Optional[Tensor]]:
+ result = self.get_incremental_state(incremental_state, "attn_state")
+ if result is not None:
+ return result
+ else:
+ empty_result: Dict[str, Optional[Tensor]] = {}
+ return empty_result
+
+ def _set_input_buffer(
+ self,
+ incremental_state: Dict[str, Dict[str, Optional[Tensor]]],
+ buffer: Dict[str, Optional[Tensor]],
+ ):
+ return self.set_incremental_state(incremental_state, "attn_state", buffer)
+
+ def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int):
+ return attn_weights
\ No newline at end of file
diff --git a/paddlespeech/s2t/models/wavlm/wavlm_asr.py b/paddlespeech/s2t/models/wavlm/wavlm_asr.py
new file mode 100644
index 00000000..5764890d
--- /dev/null
+++ b/paddlespeech/s2t/models/wavlm/wavlm_asr.py
@@ -0,0 +1,323 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections import defaultdict
+from typing import Dict
+from typing import List
+from typing import Tuple
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddlespeech.s2t.models.wav2vec2.modules.VanillaNN import VanillaNN
+from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import SpecAugment
+from paddlespeech.s2t.modules.ctc import CTCDecoderBase as CTC
+from paddlespeech.s2t.modules.initializer import DefaultInitializerContext
+from paddlespeech.s2t.utils.ctc_utils import remove_duplicates_and_blank
+from paddlespeech.s2t.utils.utility import log_add
+
+from .wavlm_paddle import WavLM, WavLMConfig
+
+
+class WavLMASR(nn.Layer):
+ def __init__(self, config: dict):
+ super().__init__()
+ init_type = config.get("init_type", None)
+ with DefaultInitializerContext(init_type):
+ self.config = config
+ wavlm_config = WavLMConfig(config)
+ wavlm = WavLM(wavlm_config)
+
+ self.normalize_wav = config.normalize_wav
+ self.output_norm = config.output_norm
+ if hasattr(config, 'spec_augment'):
+ self.spec_augment = SpecAugment(**config.spec_augment)
+
+ if config.freeze_wavlm:
+ wavlm.eval()
+ for parm in wavlm.parameters():
+ parm.trainable = False
+ self.wavlm = wavlm
+ self.enc = VanillaNN(**config.enc)
+ self.ctc = CTC(**config.ctc,
+ odim=config.output_dim,
+ batch_average=False,
+ reduction='mean')
+
+ def forward(self, wav, wavs_lens_rate, target, target_lens):
+ if self.normalize_wav:
+ wav = F.layer_norm(wav, wav.shape)
+
+ # Extract wav2vec output
+ out = self.wavlm(wav)
+ # We normalize the output if required
+ if self.output_norm:
+ out = F.layer_norm(out, out.shape)
+
+ if self.training and hasattr(self.config, 'spec_augment'):
+ feats = self.spec_augment(out)
+ else:
+ feats = out
+
+ x = self.enc(feats)
+ # x = feats
+
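+        # wavs_lens_rate holds the relative (0~1) length of each utterance in the
+        # padded batch; scaling by the encoder output length gives frame counts for CTC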
+ x_lens = (wavs_lens_rate * x.shape[1]).round().astype(paddle.int64)
+ target_lens = target_lens.astype(paddle.int64)
+ # target = target.astype(paddle.int32)
+ ctc_loss = self.ctc(x, x_lens, target, target_lens)
+
+ return ctc_loss
+
+ @paddle.no_grad()
+ def decode(self,
+ feats: paddle.Tensor,
+ text_feature: Dict[str, int],
+ decoding_method: str,
+ beam_size: int,
+ tokenizer: str=None,
+ sb_pipeline=False):
+ batch_size = feats.shape[0]
+
+ if decoding_method == 'ctc_prefix_beam_search' and batch_size > 1:
+ print(
+ f"decoding mode {decoding_method} must be running with batch_size == 1"
+ )
+ print(f"current batch_size is {batch_size}")
+
+ if decoding_method == 'ctc_greedy_search':
+ if tokenizer is None and sb_pipeline is False:
+ hyps = self.ctc_greedy_search(feats)
+ res = [text_feature.defeaturize(hyp) for hyp in hyps]
+ res_tokenids = [hyp for hyp in hyps]
+ else:
+ if sb_pipeline is True:
+ hyps = self.ctc_greedy_search(feats.unsqueeze(-1))
+ else:
+ hyps = self.ctc_greedy_search(feats)
+ res = []
+ res_tokenids = []
+ for sequence in hyps:
+ # Decode token terms to words
+ predicted_tokens = text_feature.convert_ids_to_tokens(
+ sequence)
+ tmp_res = []
+ tmp_res_tokenids = []
+ for c in predicted_tokens:
+ if c == "[CLS]":
+ continue
+ elif c == "[SEP]" or c == "[PAD]":
+ break
+ else:
+ tmp_res.append(c)
+ tmp_res_tokenids.append(text_feature.vocab[c])
+ res.append(''.join(tmp_res))
+ res_tokenids.append(tmp_res_tokenids)
+
+ # ctc_prefix_beam_search and attention_rescoring only return one
+ # result in List[int], change it to List[List[int]] for compatible
+ # with other batch decoding mode
+ elif decoding_method == 'ctc_prefix_beam_search':
+ assert feats.shape[0] == 1
+ if tokenizer is None and sb_pipeline is False:
+ hyp = self.ctc_prefix_beam_search(feats, beam_size)
+ res = [text_feature.defeaturize(hyp)]
+ res_tokenids = [hyp]
+ else:
+ if sb_pipeline is True:
+ hyp = self.ctc_prefix_beam_search(
+ feats.unsqueeze(-1), beam_size)
+ else:
+ hyp = self.ctc_prefix_beam_search(feats, beam_size)
+ res = []
+ res_tokenids = []
+ predicted_tokens = text_feature.convert_ids_to_tokens(hyp)
+ tmp_res = []
+ tmp_res_tokenids = []
+ for c in predicted_tokens:
+ if c == "[CLS]":
+ continue
+ elif c == "[SEP]" or c == "[PAD]":
+ break
+ else:
+ tmp_res.append(c)
+ tmp_res_tokenids.append(text_feature.vocab[c])
+ res.append(''.join(tmp_res))
+ res_tokenids.append(tmp_res_tokenids)
+ else:
+ raise ValueError(
+ f"WavLM not support decoding method: {decoding_method}")
+
+ return res, res_tokenids
+
+ @classmethod
+ def from_config(cls, config):
+ model = cls(config)
+ return model
+
+ def ctc_greedy_search(self, wav) -> List[List[int]]:
+ """ Apply CTC greedy search
+ Args:
+ speech (paddle.Tensor): (batch, max_len)
+ speech_length (paddle.Tensor): (batch, )
+ Returns:
+ List[List[int]]: best path result
+ """
+ batch_size = wav.shape[0]
+ wav = wav[:, :, 0]
+ if self.normalize_wav:
+ wav = F.layer_norm(wav, wav.shape[1:])
+ # Extract wavlm output
+ out = self.wavlm(wav)
+ # We normalize the output if required
+ if self.output_norm:
+ out = F.layer_norm(out, out.shape[1:])
+ feats = out
+ x = self.enc(feats)
+ x_lens = x.shape[1]
+ ctc_probs = self.ctc.log_softmax(x) # (B, maxlen, vocab_size)
+ topk_prob, topk_index = ctc_probs.topk(1, axis=2) # (B, maxlen, 1)
+        topk_index = topk_index.reshape([batch_size, x_lens])  # (B, maxlen)
+
+ hyps = [hyp.tolist() for hyp in topk_index]
+ hyps = [remove_duplicates_and_blank(hyp) for hyp in hyps]
+ return hyps
+
+ def _ctc_prefix_beam_search(
+ self,
+ wav,
+ beam_size,
+ blank_id: int=0, ) -> Tuple[List[Tuple[int, float]], paddle.Tensor]:
+ """ CTC prefix beam search inner implementation
+ Args:
+ speech (paddle.Tensor): (batch, max_len, feat_dim)
+ speech_length (paddle.Tensor): (batch, )
+ beam_size (int): beam size for beam search
+ decoding_chunk_size (int): decoding chunk for dynamic chunk
+ trained model.
+ <0: for decoding, use full chunk.
+ >0: for decoding, use fixed chunk size as set.
+ 0: used for training, it's prohibited here
+ simulate_streaming (bool): whether do encoder forward in a
+ streaming fashion
+ Returns:
+ List[Tuple[int, float]]: nbest results, (N,1), (text, likelihood)
+ paddle.Tensor: encoder output, (1, max_len, encoder_dim),
+ it will be used for rescoring in attention rescoring mode
+ """
+ wav = wav[:, :, 0]
+
+ if self.normalize_wav:
+ wav = F.layer_norm(wav, wav.shape[1:])
+ # Extract wavlm output
+ out = self.wavlm(wav)
+ # We normalize the output if required
+ if self.output_norm:
+ out = F.layer_norm(out, out.shape[1:])
+ feats = out
+
+ x = self.enc(feats)
+ maxlen = x.shape[1]
+ ctc_probs = self.ctc.log_softmax(x) # (1, maxlen, vocab_size)
+ ctc_probs = ctc_probs.squeeze(0)
+
+ # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score))
+ # blank_ending_score and none_blank_ending_score in ln domain
+ cur_hyps = [(tuple(), (0.0, -float('inf')))]
+ # 2. CTC beam search step by step
+ for t in range(0, maxlen):
+ logp = ctc_probs[t] # (vocab_size,)
+ # key: prefix, value (pb, pnb), default value(-inf, -inf)
+ next_hyps = defaultdict(lambda: (-float('inf'), -float('inf')))
+ # 2.1 First beam prune: select topk best
+ top_k_logp, top_k_index = logp.topk(beam_size) # (beam_size,)
+ for s in top_k_index:
+ s = s.item()
+ ps = logp[s].item()
+ for prefix, (pb, pnb) in cur_hyps:
+ last = prefix[-1] if len(prefix) > 0 else None
+ if s == blank_id: # blank
+ n_pb, n_pnb = next_hyps[prefix]
+ n_pb = log_add([n_pb, pb + ps, pnb + ps])
+ next_hyps[prefix] = (n_pb, n_pnb)
+ elif s == last:
+ # Update *ss -> *s;
+ n_pb, n_pnb = next_hyps[prefix]
+ n_pnb = log_add([n_pnb, pnb + ps])
+ next_hyps[prefix] = (n_pb, n_pnb)
+ # Update *s-s -> *ss, - is for blank
+ n_prefix = prefix + (s, )
+ n_pb, n_pnb = next_hyps[n_prefix]
+ n_pnb = log_add([n_pnb, pb + ps])
+ next_hyps[n_prefix] = (n_pb, n_pnb)
+ else:
+ n_prefix = prefix + (s, )
+ n_pb, n_pnb = next_hyps[n_prefix]
+ n_pnb = log_add([n_pnb, pb + ps, pnb + ps])
+ next_hyps[n_prefix] = (n_pb, n_pnb)
+
+ # 2.2 Second beam prune
+ next_hyps = sorted(
+ next_hyps.items(),
+ key=lambda x: log_add(list(x[1])),
+ reverse=True)
+ cur_hyps = next_hyps[:beam_size]
+
+ hyps = [(y[0], log_add([y[1][0], y[1][1]])) for y in cur_hyps]
+ return hyps
+
+ def ctc_prefix_beam_search(self, wav, beam_size) -> List[int]:
+ """ Apply CTC prefix beam search
+ Args:
+ speech (paddle.Tensor): (batch, max_len, feat_dim)
+ speech_length (paddle.Tensor): (batch, )
+ beam_size (int): beam size for beam search
+ decoding_chunk_size (int): decoding chunk for dynamic chunk
+ trained model.
+ <0: for decoding, use full chunk.
+ >0: for decoding, use fixed chunk size as set.
+ 0: used for training, it's prohibited here
+ simulate_streaming (bool): whether do encoder forward in a
+ streaming fashion
+ Returns:
+ List[int]: CTC prefix beam search nbest results
+ """
+ hyps = self._ctc_prefix_beam_search(wav, beam_size)
+ return hyps[0][0]
+
+
+class WavLMBase(nn.Layer):
+ """WavLM model"""
+
+ def __init__(self, config: dict):
+ super().__init__()
+ wavlm_config = WavLMConfig(config)
+ wavlm = WavLM(wavlm_config)
+ self.wavlm = wavlm
+
+ @classmethod
+ def from_config(cls, configs: dict):
+ """init model.
+ Args:
+ configs (dict): config dict.
+ Raises:
+ ValueError: raise when using not support encoder type.
+ Returns:
+ nn.Layer: WavLMBase
+ """
+ model = cls(configs)
+ return model
+
+ def forward(self, wav):
+ out = self.wavlm(wav)
+ return out
diff --git a/paddlespeech/s2t/models/wavlm/wavlm_paddle.py b/paddlespeech/s2t/models/wavlm/wavlm_paddle.py
new file mode 100644
index 00000000..6ed9ecd0
--- /dev/null
+++ b/paddlespeech/s2t/models/wavlm/wavlm_paddle.py
@@ -0,0 +1,756 @@
+# --------------------------------------------------------
+# WavLM: Large-Scale Self-Supervised Pre-training for Full Stack Speech Processing (https://arxiv.org/abs/2110.13900.pdf)
+# Github source: https://github.com/microsoft/unilm/tree/master/wavlm
+# Copyright (c) 2021 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Based on fairseq code bases
+# https://github.com/pytorch/fairseq
+# --------------------------------------------------------
+
+import math
+import logging
+from typing import List, Optional, Tuple
+
+import numpy as np
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+from paddle.nn import LayerNorm
+from paddle import Tensor
+from .modules.modules import (
+ MultiheadAttention,
+ SamePad,
+ get_activation_fn,
+ TransposeLast,
+ GLU_Linear,
+)
+
+logger = logging.getLogger(__name__)
+
+
+def compute_mask_indices(
+ shape: Tuple[int, int],
+ padding_mask: Optional[Tensor],
+ mask_prob: float,
+ mask_length: int,
+ mask_type: str = "static",
+ mask_other: float = 0.0,
+ min_masks: int = 0,
+ no_overlap: bool = False,
+ min_space: int = 0,
+) -> np.ndarray:
+ """
+ Computes random mask spans for a given shape
+
+ Args:
+        shape: the shape for which to compute masks.
+ should be of size 2 where first element is batch size and 2nd is timesteps
+ padding_mask: optional padding mask of the same size as shape, which will prevent masking padded elements
+ mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by
+ number of timesteps divided by length of mask span to mask approximately this percentage of all elements.
+ however due to overlaps, the actual number will be smaller (unless no_overlap is True)
+ mask_type: how to compute mask lengths
+ static = fixed size
+ uniform = sample from uniform distribution [mask_other, mask_length*2]
+ normal = sample from normal distribution with mean mask_length and stdev mask_other. mask is min 1 element
+            poisson = sample from Poisson distribution with lambda = mask length
+ min_masks: minimum number of masked spans
+        no_overlap: if true, uses an alternative recursive algorithm that prevents spans from overlapping
+ min_space: only used if no_overlap is True, this is how many elements to keep unmasked between spans
+ """
+
+ bsz, all_sz = shape
+ mask = np.full((bsz, all_sz), False)
+
+ all_num_mask = int(
+ # add a random number for probabilistic rounding
+ mask_prob * all_sz / float(mask_length)
+ + np.random.rand()
+ )
+
+ all_num_mask = max(min_masks, all_num_mask)
+
+ mask_idcs = []
+ for i in range(bsz):
+ if padding_mask is not None:
+ sz = all_sz - padding_mask[i].long().sum().item()
+ num_mask = int(
+ # add a random number for probabilistic rounding
+ mask_prob * sz / float(mask_length)
+ + np.random.rand()
+ )
+ num_mask = max(min_masks, num_mask)
+ else:
+ sz = all_sz
+ num_mask = all_num_mask
+
+ if mask_type == "static":
+ lengths = np.full(num_mask, mask_length)
+ elif mask_type == "uniform":
+ lengths = np.random.randint(mask_other, mask_length * 2 + 1, size=num_mask)
+ elif mask_type == "normal":
+ lengths = np.random.normal(mask_length, mask_other, size=num_mask)
+ lengths = [max(1, int(round(x))) for x in lengths]
+ elif mask_type == "poisson":
+ lengths = np.random.poisson(mask_length, size=num_mask)
+ lengths = [int(round(x)) for x in lengths]
+ else:
+ raise Exception("unknown mask selection " + mask_type)
+
+ if sum(lengths) == 0:
+ lengths[0] = min(mask_length, sz - 1)
+
+ if no_overlap:
+ mask_idc = []
+
+ def arrange(s, e, length, keep_length):
+ span_start = np.random.randint(s, e - length)
+ mask_idc.extend(span_start + i for i in range(length))
+
+ new_parts = []
+ if span_start - s - min_space >= keep_length:
+ new_parts.append((s, span_start - min_space + 1))
+ if e - span_start - keep_length - min_space > keep_length:
+ new_parts.append((span_start + length + min_space, e))
+ return new_parts
+
+ parts = [(0, sz)]
+ min_length = min(lengths)
+ for length in sorted(lengths, reverse=True):
+ lens = np.fromiter(
+ (e - s if e - s >= length + min_space else 0 for s, e in parts),
+                    int,
+ )
+ l_sum = np.sum(lens)
+ if l_sum == 0:
+ break
+ probs = lens / np.sum(lens)
+ c = np.random.choice(len(parts), p=probs)
+ s, e = parts.pop(c)
+ parts.extend(arrange(s, e, length, min_length))
+ mask_idc = np.asarray(mask_idc)
+ else:
+ min_len = min(lengths)
+ if sz - min_len <= num_mask:
+ min_len = sz - num_mask - 1
+
+ mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
+
+ mask_idc = np.asarray(
+ [
+ mask_idc[j] + offset
+ for j in range(len(mask_idc))
+ for offset in range(lengths[j])
+ ]
+ )
+
+ mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
+
+ min_len = min([len(m) for m in mask_idcs])
+ for i, mask_idc in enumerate(mask_idcs):
+ if len(mask_idc) > min_len:
+ mask_idc = np.random.choice(mask_idc, min_len, replace=False)
+ mask[i, mask_idc] = True
+
+ return mask
+
+
+class WavLMConfig:
+ def __init__(self, cfg=None):
+ self.extractor_mode: str = "default" # mode for feature extractor. default has a single group norm with d groups in the first conv block, whereas layer_norm has layer norms in every block (meant to use with normalize=True)
+ self.encoder_layers: int = 12 # num encoder layers in the transformer
+
+ self.encoder_embed_dim: int = 768 # encoder embedding dimension
+ self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN
+ self.encoder_attention_heads: int = 12 # num encoder attention heads
+ self.activation_fn: str = "gelu" # activation function to use
+
+ self.layer_norm_first: bool = False # apply layernorm first in the transformer
+ self.conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2" # string describing convolutional feature extraction layers in form of a python list that contains [(dim, kernel_size, stride), ...]
+ self.conv_bias: bool = False # include bias in conv encoder
+ self.feature_grad_mult: float = 1.0 # multiply feature extractor var grads by this
+
+ self.normalize: bool = False # normalize input to have 0 mean and unit variance during training
+
+ # dropouts
+ self.dropout: float = 0.1 # dropout probability for the transformer
+ self.attention_dropout: float = 0.1 # dropout probability for attention weights
+ self.activation_dropout: float = 0.0 # dropout probability after activation in FFN
+        self.encoder_layerdrop: float = 0.0     # probability of dropping a transformer layer
+ self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr)
+ self.dropout_features: float = 0.0 # dropout to apply to the features (after feat extr)
+
+ # masking
+ self.mask_length: int = 10 # mask length
+ self.mask_prob: float = 0.65 # probability of replacing a token with mask
+ self.mask_selection: str = "static" # how to choose mask length
+        self.mask_other: float = 0     # secondary mask argument (used for more complex distributions), see help in compute_mask_indices
+ self.no_mask_overlap: bool = False # whether to allow masks to overlap
+ self.mask_min_space: int = 1 # min space between spans (if no overlap is enabled)
+
+ # channel masking
+ self.mask_channel_length: int = 10 # length of the mask for features (channels)
+ self.mask_channel_prob: float = 0.0 # probability of replacing a feature with 0
+ self.mask_channel_selection: str = "static" # how to choose mask length for channel masking
+ self.mask_channel_other: float = 0 # secondary mask argument (used for more complex distributions), see help in compute_mask_indices
+ self.no_mask_channel_overlap: bool = False # whether to allow channel masks to overlap
+ self.mask_channel_min_space: int = 1 # min space between spans (if no overlap is enabled)
+
+ # positional embeddings
+ self.conv_pos: int = 128 # number of filters for convolutional positional embeddings
+ self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding
+
+ # relative position embedding
+ self.relative_position_embedding: bool = True # apply relative position embedding
+ self.num_buckets: int = 320 # number of buckets for relative position embedding
+ self.max_distance: int = 1280 # maximum distance for relative position embedding
+ self.gru_rel_pos: bool = True # apply gated relative position embedding
+
+ if cfg is not None:
+ self.update(cfg)
+
+ def update(self, cfg: dict):
+ self.__dict__.update(cfg)
+
+
+class WavLM(nn.Layer):
+ def __init__(
+ self,
+ cfg: WavLMConfig,
+ ) -> None:
+ super().__init__()
+ logger.info(f"WavLM Config: {cfg.__dict__}")
+
+ self.cfg = cfg
+ feature_enc_layers = eval(cfg.conv_feature_layers)
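+        # conv_feature_layers is a python-literal string (see WavLMConfig) listing
+        # (dim, kernel_size, stride) for each conv block of the feature extractor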
+ self.embed = feature_enc_layers[-1][0]
+
+ self.feature_extractor = ConvFeatureExtractionModel(
+ conv_layers=feature_enc_layers,
+ dropout=0.0,
+ mode=cfg.extractor_mode,
+ conv_bias=cfg.conv_bias,
+ )
+
+ self.post_extract_proj = (
+ nn.Linear(self.embed, cfg.encoder_embed_dim)
+ if self.embed != cfg.encoder_embed_dim
+ else None
+ )
+
+ self.mask_prob = cfg.mask_prob
+ self.mask_selection = cfg.mask_selection
+ self.mask_other = cfg.mask_other
+ self.mask_length = cfg.mask_length
+ self.no_mask_overlap = cfg.no_mask_overlap
+ self.mask_min_space = cfg.mask_min_space
+
+ self.mask_channel_prob = cfg.mask_channel_prob
+ self.mask_channel_selection = cfg.mask_channel_selection
+ self.mask_channel_other = cfg.mask_channel_other
+ self.mask_channel_length = cfg.mask_channel_length
+ self.no_mask_channel_overlap = cfg.no_mask_channel_overlap
+ self.mask_channel_min_space = cfg.mask_channel_min_space
+
+ self.dropout_input = nn.Dropout(cfg.dropout_input)
+ self.dropout_features = nn.Dropout(cfg.dropout_features)
+
+ self.feature_grad_mult = cfg.feature_grad_mult
+
+ self.mask_emb = self.create_parameter(
+ shape=[cfg.encoder_embed_dim],
+ default_initializer=nn.initializer.Uniform(),
+ )
+
+ self.encoder = TransformerEncoder(cfg)
+ self.layer_norm = LayerNorm(self.embed)
+
+ def apply_mask(self, x, padding_mask):
+ B, T, C = x.shape
+ if self.mask_prob > 0:
+ mask_indices = compute_mask_indices(
+ (B, T),
+ padding_mask,
+ self.mask_prob,
+ self.mask_length,
+ self.mask_selection,
+ self.mask_other,
+ min_masks=2,
+ no_overlap=self.no_mask_overlap,
+ min_space=self.mask_min_space,
+ )
+ # mask_indices = torch.from_numpy(mask_indices).to(x.device)
+ mask_indices = paddle.to_tensor(mask_indices, dtype='int64')
+ x[mask_indices] = self.mask_emb
+ else:
+ mask_indices = None
+
+ if self.mask_channel_prob > 0:
+ mask_channel_indices = compute_mask_indices(
+ (B, C),
+ None,
+ self.mask_channel_prob,
+ self.mask_channel_length,
+ self.mask_channel_selection,
+ self.mask_channel_other,
+ no_overlap=self.no_mask_channel_overlap,
+ min_space=self.mask_channel_min_space,
+ )
+            mask_channel_indices = (
+                paddle.to_tensor(mask_channel_indices, dtype='int64')
+                .unsqueeze(1)
+                .expand([-1, T, -1])  # broadcast the channel mask over all T frames
+            )
+ x[mask_channel_indices] = 0
+
+ return x, mask_indices
+
+ def forward_padding_mask(
+ self, features: Tensor, padding_mask: Tensor,
+ ) -> Tensor:
+        # crop the waveform-level padding mask so its length is a multiple of the
+        # number of extracted frames, then mark a frame as padded only when all
+        # samples belonging to it are padded
+        extra = padding_mask.shape[1] % features.shape[1]
+        if extra > 0:
+            padding_mask = padding_mask[:, :-extra]
+        padding_mask = padding_mask.reshape(
+            [padding_mask.shape[0], features.shape[1], -1])
+        padding_mask = padding_mask.all(-1)
+ return padding_mask
+
+ def extract_features(
+ self,
+ source: Tensor,
+ padding_mask: Optional[Tensor] = None,
+ mask: bool = False,
+ ret_conv: bool = False,
+ output_layer: Optional[int] = None,
+ ret_layer_results: bool = False,
+ ):
+
+ if self.feature_grad_mult > 0:
+ features = self.feature_extractor(source)
+ # if self.feature_grad_mult != 1.0:
+ # features = GradMultiply.apply(features, self.feature_grad_mult)
+ else:
+ # with torch.no_grad():
+ with paddle.no_grad():
+ features = self.feature_extractor(source)
+
+ features = features.transpose([0, 2, 1]) # [1, 49, 512]
+ features = self.layer_norm(features)
+
+ if padding_mask is not None:
+ padding_mask = self.forward_padding_mask(features, padding_mask)
+
+ if self.post_extract_proj is not None:
+ features = self.post_extract_proj(features)
+ # [1, 49, 768]
+ features = self.dropout_input(features)
+
+ if mask:
+ x, mask_indices = self.apply_mask(
+ features, padding_mask
+ )
+ else:
+ x = features
+
+ # feature: (B, T, D), float
+ # target: (B, T), long
+ # x: (B, T, D), float
+ # padding_mask: (B, T), bool
+ # mask_indices: (B, T), bool
+
+ x, layer_results = self.encoder(
+ x,
+ padding_mask=padding_mask,
+ layer=None if output_layer is None else output_layer - 1
+ )
+ # print(f"Debugging: x.shape: {x.shape}, x.mean(): {x.mean()}, x.std(): {x.std()}")
+ res = {"x": x, "padding_mask": padding_mask, "features": features, "layer_results": layer_results}
+
+ feature = res["features"] if ret_conv else res["x"]
+ if ret_layer_results:
+ feature = (feature, res["layer_results"])
+ return feature, res["padding_mask"]
+
+ def forward(self, x):
+ return self.extract_features(x)[0]
+
+
+class ConvFeatureExtractionModel(nn.Layer):
+ def __init__(
+ self,
+ conv_layers: List[Tuple[int, int, int]],
+ dropout: float = 0.0,
+ mode: str = "default",
+ conv_bias: bool = False,
+ conv_type: str = "default"
+ ):
+ super().__init__()
+
+ assert mode in {"default", "layer_norm"}
+
+ def block(
+ n_in,
+ n_out,
+ k,
+ stride,
+ is_layer_norm=False,
+ is_group_norm=False,
+ conv_bias=False,
+ ):
+ def make_conv():
+ conv = nn.Conv1D(n_in, n_out, k, stride=stride, bias_attr=conv_bias,
+ weight_attr=nn.initializer.KaimingNormal())
+ # nn.init.kaiming_normal_(conv.weight)
+ return conv
+
+ assert (
+ is_layer_norm and is_group_norm
+ ) == False, "layer norm and group norm are exclusive"
+
+ if is_layer_norm:
+ return nn.Sequential(
+ make_conv(),
+ nn.Dropout(p=dropout),
+ nn.Sequential(
+ TransposeLast(),
+ nn.LayerNorm(normalized_shape=dim, epsilon=1e-5),
+ TransposeLast(),
+ ),
+ nn.GELU(),
+ )
+ elif is_group_norm:
+ return nn.Sequential(
+ make_conv(),
+ nn.Dropout(p=dropout),
+ nn.GroupNorm(num_groups=dim, num_channels=dim, epsilon=1e-5),
+ nn.GELU(),
+ )
+ else:
+ return nn.Sequential(make_conv(), nn.Dropout(p=dropout), nn.GELU())
+
+ self.conv_type = conv_type
+ if self.conv_type == "default":
+ in_d = 1
+ self.conv_layers = nn.LayerList()
+ for i, cl in enumerate(conv_layers):
+ assert len(cl) == 3, "invalid conv definition: " + str(cl)
+ (dim, k, stride) = cl
+
+ self.conv_layers.append(
+ block(
+ in_d,
+ dim,
+ k,
+ stride,
+ is_layer_norm=mode == "layer_norm",
+ is_group_norm=mode == "default" and i == 0,
+ conv_bias=conv_bias,
+ )
+ )
+ in_d = dim
+ elif self.conv_type == "conv2d":
+ in_d = 1
+ self.conv_layers = nn.LayerList()
+ for i, cl in enumerate(conv_layers):
+ assert len(cl) == 3
+ (dim, k, stride) = cl
+
+ self.conv_layers.append(
+ paddle.nn.Conv2D(in_d, dim, k, stride)
+ )
+ self.conv_layers.append(paddle.nn.ReLU())
+ in_d = dim
+ elif self.conv_type == "custom":
+ in_d = 1
+ idim = 80
+ self.conv_layers = nn.LayerList()
+ for i, cl in enumerate(conv_layers):
+ assert len(cl) == 3
+ (dim, k, stride) = cl
+ self.conv_layers.append(
+ paddle.nn.Conv2D(in_d, dim, k, stride, padding=1)
+ )
+ self.conv_layers.append(
+ paddle.nn.LayerNorm([dim, idim])
+ )
+ self.conv_layers.append(paddle.nn.ReLU())
+ in_d = dim
+ if (i + 1) % 2 == 0:
+ self.conv_layers.append(
+ paddle.nn.MaxPool2D(2, stride=2, ceil_mode=True)
+ )
+ idim = int(math.ceil(idim / 2))
+ else:
+ pass
+
+ def forward(self, x, mask=None):
+
+ # BxT -> BxCxT
+ x = x.unsqueeze(1)
+ if self.conv_type == "custom":
+ for conv in self.conv_layers:
+ if isinstance(conv, nn.LayerNorm):
+ x = x.transpose([0, 2, 1])
+ x = conv(x).transpose([0, 2, 1])
+ else:
+ x = conv(x)
+            x = x.transpose([0, 1, 3, 2])
+            x = x.reshape([x.shape[0], -1, x.shape[-1]])
+ else:
+ for conv in self.conv_layers:
+ x = conv(x)
+ if self.conv_type == "conv2d":
+                b, c, t, f = x.shape
+                # x = x.transpose(2, 3).contiguous().view(b, c * f, t)
+                x = x.transpose([0, 1, 3, 2]).reshape([b, c * f, t])
+ return x
+
+
+class TransformerEncoder(nn.Layer):
+ def __init__(self, args):
+ super().__init__()
+
+ self.dropout = args.dropout
+ self.embedding_dim = args.encoder_embed_dim
+ dropout = 0
+ std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim))
+
+ self.pos_conv = nn.Conv1D(
+ self.embedding_dim,
+ self.embedding_dim,
+ kernel_size=args.conv_pos,
+ padding=args.conv_pos // 2,
+ groups=args.conv_pos_groups,
+ weight_attr=nn.initializer.Normal(mean=0, std=std),
+ bias_attr=True
+ )
+ # nn.init.normal_(self.pos_conv.weight, mean=0, std=std)
+ # nn.init.constant_(self.pos_conv.bias, 0)
+
+ # self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
+ # self.pos_conv.weight_g = self.pos_conv.weight_g.unsqueeze(0).unsqueeze(0)
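+        # weight-normalised grouped conv producing the positional embedding;
+        # SamePad keeps the output length equal to the input (it trims one frame
+        # for even kernel sizes) and GELU is the activation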
+ self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2)
+ self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU())
+
+ if hasattr(args, "relative_position_embedding"):
+ self.relative_position_embedding = args.relative_position_embedding
+ self.num_buckets = args.num_buckets
+ self.max_distance = args.max_distance
+ else:
+ self.relative_position_embedding = False
+ self.num_buckets = 0
+ self.max_distance = 0
+
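+        # only the first transformer layer learns the bucketed relative-position
+        # bias; later layers reuse it through the `pos_bias` value returned by
+        # each layer and passed on to the next one in extract_features()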
+ self.layers = nn.LayerList(
+ [
+ TransformerSentenceEncoderLayer(
+ embedding_dim=self.embedding_dim,
+ ffn_embedding_dim=args.encoder_ffn_embed_dim,
+ num_attention_heads=args.encoder_attention_heads,
+ dropout=self.dropout,
+ attention_dropout=args.attention_dropout,
+ activation_dropout=args.activation_dropout,
+ activation_fn=args.activation_fn,
+ layer_norm_first=args.layer_norm_first,
+ has_relative_attention_bias=(self.relative_position_embedding and i == 0),
+ num_buckets=self.num_buckets,
+ max_distance=self.max_distance,
+ gru_rel_pos=args.gru_rel_pos,
+ )
+ for i in range(args.encoder_layers)
+ ]
+ )
+
+ self.layer_norm_first = args.layer_norm_first
+ self.layer_norm = LayerNorm(self.embedding_dim)
+ self.layerdrop = args.encoder_layerdrop
+
+ # self.apply(init_bert_params)
+
+ def forward(self, x, padding_mask=None, streaming_mask=None, layer=None):
+ x, layer_results = self.extract_features(x, padding_mask, streaming_mask, layer)
+ # print("x.shape", x.shape)
+ if self.layer_norm_first and layer is None:
+ x = self.layer_norm(x)
+
+ return x, layer_results
+
+ def extract_features(self, x, padding_mask=None, streaming_mask=None, tgt_layer=None):
+
+ if padding_mask is not None:
+ x[padding_mask] = 0
+
+ x_conv = self.pos_conv(x.transpose([0, 2, 1]))
+ x_conv = x_conv.transpose([0, 2, 1])
+ x += x_conv
+ if not self.layer_norm_first:
+ x = self.layer_norm(x)
+
+ x = F.dropout(x, p=self.dropout, training=self.training)
+
+ # B x T x C -> T x B x C
+ # x = x.transpose(0, 1)
+ x = x.transpose([1, 0, 2])
+
+ layer_results = []
+ z = None
+ if tgt_layer is not None:
+ layer_results.append((x, z))
+ r = None
+ pos_bias = None
+ for i, layer in enumerate(self.layers):
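+            # LayerDrop: during training each layer is skipped with probability
+            # self.layerdrop (0.0 by default, i.e. never skipped)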
+ dropout_probability = np.random.random()
+ if not self.training or (dropout_probability > self.layerdrop):
+ x, z, pos_bias = layer(x, self_attn_padding_mask=padding_mask, need_weights=False,self_attn_mask=streaming_mask, pos_bias=pos_bias)
+ if tgt_layer is not None:
+ layer_results.append((x, z))
+ if i == tgt_layer:
+ r = x
+ break
+
+ if r is not None:
+ x = r
+
+ # T x B x C -> B x T x C
+ # x = x.transpose(0, 1)
+ x = x.transpose([1, 0, 2])
+
+ return x, layer_results
+
+
+class TransformerSentenceEncoderLayer(nn.Layer):
+ """
+ Implements a Transformer Encoder Layer used in BERT/XLM style pre-trained
+ models.
+ """
+
+ def __init__(
+ self,
+ embedding_dim: float = 768,
+ ffn_embedding_dim: float = 3072,
+ num_attention_heads: float = 8,
+ dropout: float = 0.1,
+ attention_dropout: float = 0.1,
+ activation_dropout: float = 0.1,
+ activation_fn: str = "relu",
+ layer_norm_first: bool = False,
+ has_relative_attention_bias: bool = True,
+ num_buckets: int = 0,
+ max_distance: int = 0,
+ rescale_init: bool = False,
+ gru_rel_pos: bool = True,
+ ) -> None:
+
+ super().__init__()
+ # Initialize parameters
+ self.embedding_dim = embedding_dim
+ self.dropout = dropout
+ self.activation_dropout = activation_dropout
+
+ # Initialize blocks
+ self.activation_name = activation_fn
+ self.activation_fn = get_activation_fn(activation_fn)
+ self.self_attn = MultiheadAttention(
+ self.embedding_dim,
+ num_attention_heads,
+ dropout=attention_dropout,
+ self_attention=True,
+ has_relative_attention_bias=has_relative_attention_bias,
+ num_buckets=num_buckets,
+ max_distance=max_distance,
+ rescale_init=rescale_init,
+ gru_rel_pos=gru_rel_pos,
+ )
+
+ self.dropout1 = nn.Dropout(dropout)
+ self.dropout2 = nn.Dropout(self.activation_dropout)
+ self.dropout3 = nn.Dropout(dropout)
+
+ self.layer_norm_first = layer_norm_first
+
+ # layer norm associated with the self attention layer
+ self.self_attn_layer_norm = LayerNorm(self.embedding_dim)
+
+ if self.activation_name == "glu":
+ self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, "swish")
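+            # GLU_Linear already applies its swish-gated activation internally,
+            # so forward() does not wrap this branch with activation_fn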
+ else:
+ self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim)
+ self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim)
+
+ # layer norm associated with the position wise feed-forward NN
+ self.final_layer_norm = LayerNorm(self.embedding_dim)
+
+ def forward(
+ self,
+ x: Tensor,
+ self_attn_mask: Tensor = None,
+ self_attn_padding_mask: Tensor = None,
+ need_weights: bool = False,
+ pos_bias=None
+ ):
+ """
+ LayerNorm is applied either before or after the self-attention/ffn
+        modules similar to the original Transformer implementation.
+ """
+ residual = x
+ if self.layer_norm_first:
+
+ x = self.self_attn_layer_norm(x)
+ x, attn, pos_bias = self.self_attn(
+ query=x,
+ key=x,
+ value=x,
+ key_padding_mask=self_attn_padding_mask,
+ need_weights=False,
+ attn_mask=self_attn_mask,
+ position_bias=pos_bias
+ )
+ # import pdb; pdb.set_trace()
+ x = self.dropout1(x)
+ x = residual + x
+
+ residual = x
+ x = self.final_layer_norm(x)
+ if self.activation_name == "glu":
+ x = self.fc1(x)
+ else:
+ x = self.activation_fn(self.fc1(x))
+ x = self.dropout2(x)
+ x = self.fc2(x)
+ x = self.dropout3(x)
+ x = residual + x
+ else:
+ x, attn, pos_bias = self.self_attn(
+ query=x,
+ key=x,
+ value=x,
+ key_padding_mask=self_attn_padding_mask,
+ need_weights=need_weights,
+ attn_mask=self_attn_mask,
+ position_bias=pos_bias
+ )
+
+ x = self.dropout1(x)
+ x = residual + x
+
+ x = self.self_attn_layer_norm(x)
+
+ residual = x
+ if self.activation_name == "glu":
+ x = self.fc1(x)
+ else:
+ x = self.activation_fn(self.fc1(x))
+ x = self.dropout2(x)
+ x = self.fc2(x)
+ x = self.dropout3(x)
+ x = residual + x
+ x = self.final_layer_norm(x)
+
+ return x, attn, pos_bias
diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py
index d9568dcc..7f040d3e 100644
--- a/paddlespeech/s2t/modules/attention.py
+++ b/paddlespeech/s2t/modules/attention.py
@@ -15,6 +15,7 @@
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""Multi-Head Attention layer definition."""
import math
+from typing import List
from typing import Tuple
import paddle
@@ -26,7 +27,10 @@ from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
-__all__ = ["MultiHeadedAttention", "RelPositionMultiHeadedAttention"]
+__all__ = [
+ "MultiHeadedAttention", "RelPositionMultiHeadedAttention",
+ "RoPERelPositionMultiHeadedAttention"
+]
# Relative Positional Encodings
# https://www.jianshu.com/p/c0608efcc26f
@@ -75,9 +79,9 @@ class MultiHeadedAttention(nn.Layer):
"""
n_batch = query.shape[0]
- q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
- k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
- v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
+ q = self.linear_q(query).reshape([n_batch, -1, self.h, self.d_k])
+ k = self.linear_k(key).reshape([n_batch, -1, self.h, self.d_k])
+ v = self.linear_v(value).reshape([n_batch, -1, self.h, self.d_k])
q = q.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k)
k = k.transpose([0, 2, 1, 3]) # (batch, head, time2, d_k)
@@ -125,8 +129,8 @@ class MultiHeadedAttention(nn.Layer):
p_attn = self.dropout(attn)
x = paddle.matmul(p_attn, value) # (batch, head, time1, d_k)
- x = x.transpose([0, 2, 1, 3]).view(n_batch, -1, self.h *
- self.d_k) # (batch, time1, d_model)
+ x = x.transpose([0, 2, 1, 3]).reshape([n_batch, -1, self.h *
+ self.d_k]) # (batch, time1, d_model)
return self.linear_out(x) # (batch, time1, d_model)
@@ -165,6 +169,7 @@ class MultiHeadedAttention(nn.Layer):
and `head * d_k == size`
"""
+ # (B,T,D) -> (B,T,H,D/H)
q, k, v = self.forward_qkv(query, key, value)
# when export onnx model, for 1st chunk, we feed
@@ -200,7 +205,12 @@ class MultiHeadedAttention(nn.Layer):
class RelPositionMultiHeadedAttention(MultiHeadedAttention):
"""Multi-Head Attention layer with relative position encoding."""
- def __init__(self, n_head, n_feat, dropout_rate):
+ def __init__(self,
+ n_head,
+ n_feat,
+ dropout_rate,
+ adaptive_scale=False,
+ init_weights=False):
"""Construct an RelPositionMultiHeadedAttention object.
Paper: https://arxiv.org/abs/1901.02860
Args:
@@ -223,6 +233,39 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
pos_bias_v = self.create_parameter(
(self.h, self.d_k), default_initializer=I.XavierUniform())
self.add_parameter('pos_bias_v', pos_bias_v)
+ self.adaptive_scale = adaptive_scale
+ if self.adaptive_scale:
+ ada_scale = self.create_parameter(
+ [1, 1, n_feat], default_initializer=I.Constant(1.0))
+ self.add_parameter('ada_scale', ada_scale)
+ ada_bias = self.create_parameter(
+ [1, 1, n_feat], default_initializer=I.Constant(0.0))
+ self.add_parameter('ada_bias', ada_bias)
+ if init_weights:
+ self.init_weights()
+
+ def init_weights(self):
+ input_max = (self.h * self.d_k)**-0.5
+ self.linear_q._param_attr = paddle.nn.initializer.Uniform(
+ low=-input_max, high=input_max)
+ self.linear_q._bias_attr = paddle.nn.initializer.Uniform(
+ low=-input_max, high=input_max)
+ self.linear_k._param_attr = paddle.nn.initializer.Uniform(
+ low=-input_max, high=input_max)
+ self.linear_k._bias_attr = paddle.nn.initializer.Uniform(
+ low=-input_max, high=input_max)
+ self.linear_v._param_attr = paddle.nn.initializer.Uniform(
+ low=-input_max, high=input_max)
+ self.linear_v._bias_attr = paddle.nn.initializer.Uniform(
+ low=-input_max, high=input_max)
+ self.linear_pos._param_attr = paddle.nn.initializer.Uniform(
+ low=-input_max, high=input_max)
+ self.linear_pos._bias_attr = paddle.nn.initializer.Uniform(
+ low=-input_max, high=input_max)
+ self.linear_out._param_attr = paddle.nn.initializer.Uniform(
+ low=-input_max, high=input_max)
+ self.linear_out._bias_attr = paddle.nn.initializer.Uniform(
+ low=-input_max, high=input_max)
def rel_shift(self, x, zero_triu: bool=False):
"""Compute relative positinal encoding.
@@ -273,6 +316,11 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
where `cache_t == chunk_size * num_decoding_left_chunks`
and `head * d_k == size`
"""
+ if self.adaptive_scale:
+ query = self.ada_scale * query + self.ada_bias
+ key = self.ada_scale * key + self.ada_bias
+ value = self.ada_scale * value + self.ada_bias
+
q, k, v = self.forward_qkv(query, key, value)
# q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k)
@@ -301,7 +349,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
new_cache = paddle.concat((k, v), axis=-1)
n_batch_pos = pos_emb.shape[0]
- p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
+ p = self.linear_pos(pos_emb).reshape([n_batch_pos, -1, self.h, self.d_k])
p = p.transpose([0, 2, 1, 3]) # (batch, head, time1, d_k)
# (batch, head, time1, d_k)
@@ -330,3 +378,139 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
self.d_k) # (batch, head, time1, time2)
return self.forward_attention(v, scores, mask), new_cache
+
+
+class RoPERelPositionMultiHeadedAttention(MultiHeadedAttention):
+ """Multi-Head Attention layer with RoPE relative position encoding."""
+
+ def __init__(self,
+ n_head,
+ n_feat,
+ dropout_rate,
+ adaptive_scale=False,
+ init_weights=False):
+ """Construct an RelPositionMultiHeadedAttention object.
+ Paper: https://arxiv.org/abs/1901.02860
+ Args:
+ n_head (int): The number of heads.
+ n_feat (int): The number of features.
+ dropout_rate (float): Dropout rate.
+ """
+ super().__init__(n_head, n_feat, dropout_rate)
+
+ def align(self, tensor: paddle.Tensor, axes: List[int], ndim=None):
+ """重新对齐tensor(批量版expand_dims)
+ axes:原来的第i维对齐新tensor的第axes[i]维;
+ ndim:新tensor的维度。
+ """
+ assert len(axes) == tensor.dim()
+ assert ndim or min(axes) >= 0
+
+ ndim = ndim or max(axes) + 1
+
+ # a[0, None, 1] = a[0, np.newaxis, 1]
+ indices = [None] * ndim
+ for i in axes:
+ # slice nothing, a[0, slice(None), 1] = a[0, :, 1]
+ indices[i] = slice(None)
+
+ return tensor[indices]
+
+ def apply_rotary_position_embeddings(self, sinusoidal, *tensors):
+ """应用RoPE到tensors中
+ 其中,sinusoidal.shape=[B, T, D],tensors为tensor的列表,而
+ tensor.shape=[B, T, ..., D], or (B,H,T,D/H)
+ """
+ assert len(tensors) > 0, 'at least one input tensor'
+ assert all(
+ [tensor.shape == tensors[0].shape
+ for tensor in tensors[1:]]), 'all tensors must have the same shape'
+
+ # (B,H,T,D)
+ ndim = tensors[0].dim()
+ _, H, T, D = tensors[0].shape
+
+ # sinusoidal shape same with tensors[0]
+ # [B,T,D] -> [B,T,H,D/H] -> (B,H,T,D/H)
+ # sinusoidal = self.align(sinusoidal, [0, 1, -1], ndim)
+ sinusoidal = sinusoidal.reshape((1, T, H, D)).transpose([0, 2, 1, 3])
+
+ # http://man.hubwiz.com/docset/TensorFlow.docset/Contents/Resources/Documents/api_docs/python/tf/keras/backend/repeat_elements.html
+ # like np.repeat, x (s1, s2, s3), axis 1, (s1, s2*rep, s3)
+ # [b,T, ..., d/2] -> [b,T, ..., d]
+ cos_pos = paddle.repeat_interleave(sinusoidal[..., 1::2], 2, axis=-1)
+ sin_pos = paddle.repeat_interleave(sinusoidal[..., 0::2], 2, axis=-1)
+ outputs = []
+ for tensor in tensors:
+ # x2 = [-x2, x1, -x4, x3, ..., -x_d, x_{d-1}]
+ tensor2 = paddle.stack([-tensor[..., 1::2], tensor[..., ::2]], ndim)
+ tensor2 = paddle.reshape(tensor2, paddle.shape(tensor))
+
+            # formula (34): out = x * cos_pos + x2 * sin_pos
+ outputs.append(tensor * cos_pos + tensor2 * sin_pos)
+ return outputs[0] if len(outputs) == 1 else outputs
+
+ def forward(self,
+ query: paddle.Tensor,
+ key: paddle.Tensor,
+ value: paddle.Tensor,
+ mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+ pos_emb: paddle.Tensor=paddle.empty([0]),
+ cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0])
+ ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ """Compute 'Scaled Dot Product Attention' with rel. positional encoding.
+ Ref: https://github.com/facebookresearch/llama/blob/main/llama/model.py
+ Args:
+ query (paddle.Tensor): Query tensor (#batch, time1, size).
+ key (paddle.Tensor): Key tensor (#batch, time2, size).
+ value (paddle.Tensor): Value tensor (#batch, time2, size).
+ mask (paddle.Tensor): Mask tensor (#batch, 1, time2) or
+ (#batch, time1, time2), (0, 0, 0) means fake mask.
+ pos_emb (paddle.Tensor): Positional embedding tensor
+ (#batch, time2, size).
+ cache (paddle.Tensor): Cache tensor (1, head, cache_t, d_k * 2),
+ where `cache_t == chunk_size * num_decoding_left_chunks`
+ and `head * d_k == size`
+ Returns:
+ paddle.Tensor: Output tensor (#batch, time1, d_model).
+ paddle.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
+ where `cache_t == chunk_size * num_decoding_left_chunks`
+ and `head * d_k == size`
+ """
+ q, k, v = self.forward_qkv(query, key, value)
+ # q = q.transpose([0, 2, 1, 3]) # (batch, time1, head, d_k)
+
+ # f{q,k}(x_m, m) = R^d_{\theta, m} W_{q,k} x_m, m is position index
+ # q_t always is chunk_size
+ q_t = q.shape[2]
+ q = self.apply_rotary_position_embeddings(pos_emb[:, -q_t:, :], q)
+ # k will increase when in streaming decoding.
+ k = self.apply_rotary_position_embeddings(pos_emb[:, -q_t:, :], k)
+
+ # when export onnx model, for 1st chunk, we feed
+ # cache(1, head, 0, d_k * 2) (16/-1, -1/-1, 16/0 mode)
+ # or cache(1, head, real_cache_t, d_k * 2) (16/4 mode).
+        # In all modes, `if cache.size(0) > 0` will always be `True`
+        # and we will always do splitting and
+        # concatenation (this will simplify onnx export). Note that
+ # it's OK to concat & split zero-shaped tensors(see code below).
+ # when export jit model, for 1st chunk, we always feed
+ # cache(0, 0, 0, 0) since jit supports dynamic if-branch.
+ # >>> a = torch.ones((1, 2, 0, 4))
+ # >>> b = torch.ones((1, 2, 3, 4))
+ # >>> c = torch.cat((a, b), dim=2)
+ # >>> torch.equal(b, c) # True
+ # >>> d = torch.split(a, 2, dim=-1)
+ # >>> torch.equal(d[0], d[1]) # True
+ if cache.shape[0] > 0:
+ # last dim `d_k * 2` for (key, val)
+ key_cache, value_cache = paddle.split(cache, 2, axis=-1)
+ k = paddle.concat([key_cache, k], axis=2)
+ v = paddle.concat([value_cache, v], axis=2)
+ # We do cache slicing in encoder.forward_chunk, since it's
+ # non-trivial to calculate `next_cache_start` here.
+ new_cache = paddle.concat((k, v), axis=-1)
+
+ # dot(q, k)
+ scores = paddle.matmul(q, k, transpose_y=True) / math.sqrt(self.d_k)
+ return self.forward_attention(v, scores, mask), new_cache
diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py
index 09d903ee..7a0c72f3 100644
--- a/paddlespeech/s2t/modules/conformer_convolution.py
+++ b/paddlespeech/s2t/modules/conformer_convolution.py
@@ -18,6 +18,7 @@ from typing import Tuple
import paddle
from paddle import nn
+from paddle.nn import initializer as I
from typeguard import check_argument_types
from paddlespeech.s2t.modules.align import BatchNorm1D
@@ -39,7 +40,9 @@ class ConvolutionModule(nn.Layer):
activation: nn.Layer=nn.ReLU(),
norm: str="batch_norm",
causal: bool=False,
- bias: bool=True):
+ bias: bool=True,
+ adaptive_scale: bool=False,
+ init_weights: bool=False):
"""Construct an ConvolutionModule object.
Args:
channels (int): The number of channels of conv layers.
@@ -51,6 +54,18 @@ class ConvolutionModule(nn.Layer):
"""
assert check_argument_types()
super().__init__()
+ self.bias = bias
+ self.channels = channels
+ self.kernel_size = kernel_size
+ self.adaptive_scale = adaptive_scale
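+        # adaptive scaling (used by the Squeezeformer encoder): a learnable
+        # per-channel scale and bias applied to the module input in forward()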
+ if self.adaptive_scale:
+ ada_scale = self.create_parameter(
+ [1, 1, channels], default_initializer=I.Constant(1.0))
+ self.add_parameter('ada_scale', ada_scale)
+ ada_bias = self.create_parameter(
+ [1, 1, channels], default_initializer=I.Constant(0.0))
+ self.add_parameter('ada_bias', ada_bias)
+
self.pointwise_conv1 = Conv1D(
channels,
2 * channels,
@@ -105,6 +120,28 @@ class ConvolutionModule(nn.Layer):
)
self.activation = activation
+ if init_weights:
+ self.init_weights()
+
+ def init_weights(self):
+ pw_max = self.channels**-0.5
+ dw_max = self.kernel_size**-0.5
+ self.pointwise_conv1._param_attr = paddle.nn.initializer.Uniform(
+ low=-pw_max, high=pw_max)
+ if self.bias:
+ self.pointwise_conv1._bias_attr = paddle.nn.initializer.Uniform(
+ low=-pw_max, high=pw_max)
+ self.depthwise_conv._param_attr = paddle.nn.initializer.Uniform(
+ low=-dw_max, high=dw_max)
+ if self.bias:
+ self.depthwise_conv._bias_attr = paddle.nn.initializer.Uniform(
+ low=-dw_max, high=dw_max)
+ self.pointwise_conv2._param_attr = paddle.nn.initializer.Uniform(
+ low=-pw_max, high=pw_max)
+ if self.bias:
+ self.pointwise_conv2._bias_attr = paddle.nn.initializer.Uniform(
+ low=-pw_max, high=pw_max)
+
def forward(
self,
x: paddle.Tensor,
@@ -123,6 +160,9 @@ class ConvolutionModule(nn.Layer):
paddle.Tensor: Output tensor (#batch, time, channels).
paddle.Tensor: Output cache tensor (#batch, channels, time')
"""
+ if self.adaptive_scale:
+ x = self.ada_scale * x + self.ada_bias
+
# exchange the temporal dimension and the feature dimension
x = x.transpose([0, 2, 1]) # [B, C, T]
diff --git a/paddlespeech/s2t/modules/conv2d.py b/paddlespeech/s2t/modules/conv2d.py
new file mode 100644
index 00000000..ca6e136a
--- /dev/null
+++ b/paddlespeech/s2t/modules/conv2d.py
@@ -0,0 +1,62 @@
+from typing import Optional
+from typing import Union
+
+import paddle
+import paddle.nn.functional as F
+from paddle.nn.layer.conv import _ConvNd
+
+__all__ = ['Conv2DValid']
+
+
+class Conv2DValid(_ConvNd):
+ """
+ Conv2d operator for VALID mode padding.
+ """
+
+ def __init__(self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: int,
+ stride: int=1,
+ padding: Union[str, int]=0,
+ dilation: int=1,
+ groups: int=1,
+ padding_mode: str='zeros',
+ weight_attr=None,
+ bias_attr=None,
+ data_format="NCHW",
+ valid_trigx: bool=False,
+ valid_trigy: bool=False) -> None:
+ super(Conv2DValid, self).__init__(
+ in_channels,
+ out_channels,
+ kernel_size,
+ False,
+ 2,
+ stride=stride,
+ padding=padding,
+ padding_mode=padding_mode,
+ dilation=dilation,
+ groups=groups,
+ weight_attr=weight_attr,
+ bias_attr=bias_attr,
+ data_format=data_format)
+ self.valid_trigx = valid_trigx
+ self.valid_trigy = valid_trigy
+
+ def _conv_forward(self,
+ input: paddle.Tensor,
+ weight: paddle.Tensor,
+ bias: Optional[paddle.Tensor]):
+ validx, validy = 0, 0
+ if self.valid_trigx:
+ validx = (input.shape[-2] *
+ (self._stride[-2] - 1) - 1 + self._kernel_size[-2]) // 2
+ if self.valid_trigy:
+ validy = (input.shape[-1] *
+ (self._stride[-1] - 1) - 1 + self._kernel_size[-1]) // 2
+ return F.conv2d(input, weight, bias, self._stride, (validx, validy),
+ self._dilation, self._groups)
+
+ def forward(self, input: paddle.Tensor) -> paddle.Tensor:
+ return self._conv_forward(input, self.weight, self.bias)
diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py
index f41a7b5d..1e9f0101 100644
--- a/paddlespeech/s2t/modules/embedding.py
+++ b/paddlespeech/s2t/modules/embedding.py
@@ -85,18 +85,21 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface):
reverse (bool, optional): Not used. Defaults to False.
"""
nn.Layer.__init__(self)
- self.d_model = d_model
+ self.d_model = paddle.to_tensor(d_model)
self.max_len = max_len
self.xscale = paddle.to_tensor(math.sqrt(self.d_model))
self.dropout = nn.Dropout(p=dropout_rate)
+ self.base = paddle.to_tensor(10000.0)
self.pe = paddle.zeros([1, self.max_len, self.d_model]) #[B=1,T,D]
position = paddle.arange(
0, self.max_len, dtype=paddle.float32).unsqueeze(1) #[T, 1]
+ # base^{-2(i-1)/d)}, i \in (1,2...,d/2)
div_term = paddle.exp(
- paddle.arange(0, self.d_model, 2, dtype=paddle.float32) *
- -(math.log(10000.0) / self.d_model))
+ -paddle.arange(0, self.d_model, 2, dtype=paddle.float32) *
+ (paddle.log(self.base) / self.d_model))
+ # [B,T,D]
self.pe[:, :, 0::2] = paddle.sin(position * div_term)
self.pe[:, :, 1::2] = paddle.cos(position * div_term)
@@ -161,6 +164,98 @@ class RelPositionalEncoding(PositionalEncoding):
assert offset + x.shape[
1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format(
offset, x.shape[1], self.max_len)
+
x = x * self.xscale
pos_emb = self.pe[:, offset:offset + x.shape[1]]
return self.dropout(x), self.dropout(pos_emb)
+
+
+# RotaryRelPositionalEncoding is same to RelPositionalEncoding
+class ScaledRotaryRelPositionalEncoding(RelPositionalEncoding):
+ """Scaled Rotary Relative positional encoding module.
+    POSITION INTERPOLATION: https://arxiv.org/pdf/2306.15595v2.pdf
+ """
+
+ def __init__(self,
+ d_model: int,
+ dropout_rate: float,
+ max_len: int=5000,
+ scale=1):
+ """
+ Args:
+ d_model (int): Embedding dimension.
+ dropout_rate (float): Dropout rate.
+ max_len (int, optional): [Maximum input length.]. Defaults to 5000.
+            scale (int): Interpolate the maximum input length to `scale * max_len` positions.
+ """
+ super().__init__(d_model, dropout_rate, max_len, reverse=True)
+ self.pscale = paddle.to_tensor(scale)
+ self.max_len = max_len * scale
+
+ def sinusoidal_embeddings(self,
+ pos: paddle.Tensor,
+ dim: paddle.Tensor,
+ base=10000) -> paddle.Tensor:
+ """计算pos位置的dim维sinusoidal编码"""
+ assert dim % 2 == 0
+ # (d/2,)
+ indices = paddle.arange(0, dim // 2, dtype=pos.dtype)
+ indices = paddle.pow(paddle.cast(base, pos.dtype), -2 * indices / dim)
+ # pos (1, T), indices (d/2,) -> (1, T, d/2)
+ embeddings = paddle.einsum('...,d->...d', pos, indices)
+ # (1, T, d/2, 2)
+ embeddings = paddle.stack(
+ [paddle.sin(embeddings), paddle.cos(embeddings)], axis=-1)
+ # (1, T, d)
+ embeddings = paddle.flatten(embeddings, start_axis=-2, stop_axis=-1)
+ return embeddings
+
+ def forward(self, x: paddle.Tensor,
+ offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ """Compute positional encoding.
+ Args:
+ x (paddle.Tensor): Input tensor (batch, time, `*`).
+ Returns:
+ paddle.Tensor: Encoded tensor (batch, time, `*`).
+ paddle.Tensor: Positional embedding tensor (1, time, `*`).
+ """
+ x = x * self.xscale
+
+ B, T, D = x.shape
+ assert D == self.d_model
+
+        # position interpolation
+ start = 0
+ end = T * self.pscale
+ assert end <= self.max_len
+ position = paddle.arange(start, end, dtype=x.dtype).unsqueeze(0)
+ position *= 1.0 / self.pscale
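+        # position interpolation: a length-T input is mapped onto the first
+        # T / scale (fractional) positions of the trained range instead of
+        # extrapolating beyond max_len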
+ pe = self.sinusoidal_embeddings(position, self.d_model, base=self.base)
+
+ pos_emb = pe[:, offset:offset + x.shape[1]]
+ return self.dropout(x), self.dropout(pos_emb)
+
+ def position_encoding(self, offset: int, size: int) -> paddle.Tensor:
+ """ For getting encoding in a streaming fashion
+ Attention!!!!!
+        we apply dropout only once at the whole utterance level in a
+        non-streaming way, but will call this function several times with
+ increasing input size in a streaming scenario, so the dropout will
+ be applied several times.
+ Args:
+ offset (int): start offset
+            size (int): required size of position encoding
+ Returns:
+ paddle.Tensor: Corresponding position encoding, #[1, T, D].
+ """
+        # position interpolation
+ start = offset
+ end = (offset + size) * self.pscale
+ assert end <= self.max_len
+ position = paddle.arange(
+ start, end, dtype=paddle.get_default_dtype()).unsqueeze(0)
+ position *= 1.0 / self.pscale
+
+ pe = self.sinusoidal_embeddings(position, self.d_model, base=self.base)
+
+ return self.dropout(pe)
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index fd7bd7b9..27d7ffbd 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -14,7 +14,10 @@
# limitations under the License.
# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""Encoder definition."""
+from typing import List
+from typing import Optional
from typing import Tuple
+from typing import Union
import paddle
from paddle import nn
@@ -22,13 +25,16 @@ from typeguard import check_argument_types
from paddlespeech.s2t.modules.activation import get_activation
from paddlespeech.s2t.modules.align import LayerNorm
+from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.modules.attention import MultiHeadedAttention
from paddlespeech.s2t.modules.attention import RelPositionMultiHeadedAttention
+from paddlespeech.s2t.modules.attention import RoPERelPositionMultiHeadedAttention
from paddlespeech.s2t.modules.conformer_convolution import ConvolutionModule
from paddlespeech.s2t.modules.embedding import NoPositionalEncoding
from paddlespeech.s2t.modules.embedding import PositionalEncoding
from paddlespeech.s2t.modules.embedding import RelPositionalEncoding
from paddlespeech.s2t.modules.encoder_layer import ConformerEncoderLayer
+from paddlespeech.s2t.modules.encoder_layer import SqueezeformerEncoderLayer
from paddlespeech.s2t.modules.encoder_layer import TransformerEncoderLayer
from paddlespeech.s2t.modules.mask import add_optional_chunk_mask
from paddlespeech.s2t.modules.mask import make_non_pad_mask
@@ -36,12 +42,19 @@ from paddlespeech.s2t.modules.positionwise_feed_forward import PositionwiseFeedF
from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling4
from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling6
from paddlespeech.s2t.modules.subsampling import Conv2dSubsampling8
+from paddlespeech.s2t.modules.subsampling import DepthwiseConv2DSubsampling4
from paddlespeech.s2t.modules.subsampling import LinearNoSubsampling
+from paddlespeech.s2t.modules.time_reduction import TimeReductionLayer1D
+from paddlespeech.s2t.modules.time_reduction import TimeReductionLayer2D
+from paddlespeech.s2t.modules.time_reduction import TimeReductionLayerStream
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
-__all__ = ["BaseEncoder", 'TransformerEncoder', "ConformerEncoder"]
+__all__ = [
+ "BaseEncoder", 'TransformerEncoder', "ConformerEncoder",
+ "SqueezeformerEncoder"
+]
class BaseEncoder(nn.Layer):
@@ -103,6 +116,8 @@ class BaseEncoder(nn.Layer):
pos_enc_class = PositionalEncoding
elif pos_enc_layer_type == "rel_pos":
pos_enc_class = RelPositionalEncoding
+ elif pos_enc_layer_type == "rope_pos":
+ pos_enc_class = RelPositionalEncoding
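+            # RoPE reuses the plain sinusoidal table from RelPositionalEncoding;
+            # the rotation itself is applied inside RoPERelPositionMultiHeadedAttention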
elif pos_enc_layer_type == "no_pos":
pos_enc_class = NoPositionalEncoding
else:
@@ -218,14 +233,14 @@ class BaseEncoder(nn.Layer):
xs = self.global_cmvn(xs)
# before embed, xs=(B, T, D1), pos_emb=(B=1, T, D)
- xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset)
+ xs, _, _ = self.embed(xs, tmp_masks, offset=offset)
# after embed, xs=(B=1, chunk_size, hidden-dim)
elayers, _, cache_t1, _ = att_cache.shape
chunk_size = xs.shape[1]
attention_key_size = cache_t1 + chunk_size
- # only used when using `RelPositionMultiHeadedAttention`
+ # only used when using `RelPositionMultiHeadedAttention` and `RoPERelPositionMultiHeadedAttention`
pos_emb = self.embed.position_encoding(
offset=offset - cache_t1, size=attention_key_size)
@@ -462,21 +477,35 @@ class ConformerEncoder(BaseEncoder):
activation = get_activation(activation_type)
# self-attention module definition
- encoder_selfattn_layer = RelPositionMultiHeadedAttention
- encoder_selfattn_layer_args = (attention_heads, output_size,
- attention_dropout_rate)
+ encoder_dim = output_size
+ if pos_enc_layer_type == "abs_pos":
+ encoder_selfattn_layer = MultiHeadedAttention
+ encoder_selfattn_layer_args = (attention_heads, encoder_dim,
+ attention_dropout_rate)
+ elif pos_enc_layer_type == "rel_pos":
+ encoder_selfattn_layer = RelPositionMultiHeadedAttention
+ encoder_selfattn_layer_args = (attention_heads, encoder_dim,
+ attention_dropout_rate)
+ elif pos_enc_layer_type == "rope_pos":
+ encoder_selfattn_layer = RoPERelPositionMultiHeadedAttention
+ encoder_selfattn_layer_args = (attention_heads, encoder_dim,
+ attention_dropout_rate)
+ else:
+ raise ValueError(
+ f"pos_enc_layer_type {pos_enc_layer_type} not supported.")
+
# feed-forward module definition
positionwise_layer = PositionwiseFeedForward
- positionwise_layer_args = (output_size, linear_units, dropout_rate,
+ positionwise_layer_args = (encoder_dim, linear_units, dropout_rate,
activation)
# convolution module definition
convolution_layer = ConvolutionModule
- convolution_layer_args = (output_size, cnn_module_kernel, activation,
+ convolution_layer_args = (encoder_dim, cnn_module_kernel, activation,
cnn_module_norm, causal)
self.encoders = nn.LayerList([
ConformerEncoderLayer(
- size=output_size,
+ size=encoder_dim,
self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),
feed_forward=positionwise_layer(*positionwise_layer_args),
feed_forward_macaron=positionwise_layer(
@@ -487,3 +516,374 @@ class ConformerEncoder(BaseEncoder):
normalize_before=normalize_before,
concat_after=concat_after) for _ in range(num_blocks)
])
+
+
+class SqueezeformerEncoder(nn.Layer):
+ def __init__(self,
+ input_size: int,
+ encoder_dim: int=256,
+ output_size: int=256,
+ attention_heads: int=4,
+ num_blocks: int=12,
+ reduce_idx: Optional[Union[int, List[int]]]=5,
+ recover_idx: Optional[Union[int, List[int]]]=11,
+ feed_forward_expansion_factor: int=4,
+ dw_stride: bool=False,
+ input_dropout_rate: float=0.1,
+ pos_enc_layer_type: str="rel_pos",
+ time_reduction_layer_type: str="conv1d",
+ feed_forward_dropout_rate: float=0.1,
+ attention_dropout_rate: float=0.1,
+ cnn_module_kernel: int=31,
+ cnn_norm_type: str="layer_norm",
+ dropout: float=0.1,
+ causal: bool=False,
+ adaptive_scale: bool=True,
+ activation_type: str="swish",
+ init_weights: bool=True,
+ global_cmvn: paddle.nn.Layer=None,
+ normalize_before: bool=False,
+ use_dynamic_chunk: bool=False,
+ concat_after: bool=False,
+ static_chunk_size: int=0,
+ use_dynamic_left_chunk: bool=False):
+ """Construct SqueezeformerEncoder
+
+ Args:
+ input_size to use_dynamic_chunk, see in Transformer BaseEncoder.
+ encoder_dim (int): The hidden dimension of encoder layer.
+ output_size (int): The output dimension of final projection layer.
+ attention_heads (int): Num of attention head in attention module.
+ num_blocks (int): Num of encoder layers.
+ reduce_idx Optional[Union[int, List[int]]]:
+ reduce layer index, from 40ms to 80ms per frame.
+ recover_idx Optional[Union[int, List[int]]]:
+ recover layer index, from 80ms to 40ms per frame.
+ feed_forward_expansion_factor (int): Enlarge coefficient of FFN.
+ dw_stride (bool): Whether do depthwise convolution
+ on subsampling module.
+ input_dropout_rate (float): Dropout rate of input projection layer.
+ pos_enc_layer_type (str): Self attention type.
+ time_reduction_layer_type (str): Conv1d or Conv2d reduction layer.
+ cnn_module_kernel (int): Kernel size of CNN module.
+ activation_type (str): Encoder activation function type.
+ adaptive_scale (bool): Whether to use adaptive scale.
+ init_weights (bool): Whether to initialize weights.
+ causal (bool): whether to use causal convolution or not.
+ """
+ assert check_argument_types()
+ super().__init__()
+ self.global_cmvn = global_cmvn
+ self.reduce_idx: Optional[Union[int, List[int]]] = [reduce_idx] \
+ if type(reduce_idx) == int else reduce_idx
+ self.recover_idx: Optional[Union[int, List[int]]] = [recover_idx] \
+ if type(recover_idx) == int else recover_idx
+ self.check_ascending_list()
+ if reduce_idx is None:
+ self.time_reduce = None
+ else:
+ if recover_idx is None:
+ self.time_reduce = 'normal' # no recovery at the end
+ else:
+ self.time_reduce = 'recover' # recovery at the end
+ assert len(self.reduce_idx) == len(self.recover_idx)
+ self.reduce_stride = 2
+ self._output_size = output_size
+ self.normalize_before = normalize_before
+ self.static_chunk_size = static_chunk_size
+ self.use_dynamic_chunk = use_dynamic_chunk
+ self.use_dynamic_left_chunk = use_dynamic_left_chunk
+ activation = get_activation(activation_type)
+
+ # self-attention module definition
+ if pos_enc_layer_type == "abs_pos":
+ encoder_selfattn_layer = MultiHeadedAttention
+ encoder_selfattn_layer_args = (attention_heads, output_size,
+ attention_dropout_rate)
+ elif pos_enc_layer_type == "rel_pos":
+ encoder_selfattn_layer = RelPositionMultiHeadedAttention
+ encoder_selfattn_layer_args = (attention_heads, encoder_dim,
+ attention_dropout_rate,
+ adaptive_scale, init_weights)
+ elif pos_enc_layer_type == "rope_pos":
+ encoder_selfattn_layer = RoPERelPositionMultiHeadedAttention
+ encoder_selfattn_layer_args = (attention_heads, encoder_dim,
+ attention_dropout_rate,
+ adaptive_scale, init_weights)
+ else:
+ raise ValueError(
+ f"pos_enc_layer_type {pos_enc_layer_type} not supported.")
+
+ # feed-forward module definition
+ positionwise_layer = PositionwiseFeedForward
+ positionwise_layer_args = (
+ encoder_dim, encoder_dim * feed_forward_expansion_factor,
+ feed_forward_dropout_rate, activation, adaptive_scale, init_weights)
+
+ # convolution module definition
+ convolution_layer = ConvolutionModule
+ convolution_layer_args = (encoder_dim, cnn_module_kernel, activation,
+ cnn_norm_type, causal, True, adaptive_scale,
+ init_weights)
+
+ self.embed = DepthwiseConv2DSubsampling4(
+ 1, encoder_dim,
+ RelPositionalEncoding(encoder_dim, dropout_rate=0.1), dw_stride,
+ input_size, input_dropout_rate, init_weights)
+
+ self.preln = LayerNorm(encoder_dim)
+ self.encoders = paddle.nn.LayerList([
+ SqueezeformerEncoderLayer(
+ encoder_dim,
+ encoder_selfattn_layer(*encoder_selfattn_layer_args),
+ positionwise_layer(*positionwise_layer_args),
+ convolution_layer(*convolution_layer_args),
+ positionwise_layer(*positionwise_layer_args), normalize_before,
+ dropout, concat_after) for _ in range(num_blocks)
+ ])
+ if time_reduction_layer_type == 'conv1d':
+ time_reduction_layer = TimeReductionLayer1D
+ time_reduction_layer_args = {
+ 'channel': encoder_dim,
+ 'out_dim': encoder_dim,
+ }
+ elif time_reduction_layer_type == 'stream':
+ time_reduction_layer = TimeReductionLayerStream
+ time_reduction_layer_args = {
+ 'channel': encoder_dim,
+ 'out_dim': encoder_dim,
+ }
+ else:
+ time_reduction_layer = TimeReductionLayer2D
+ time_reduction_layer_args = {'encoder_dim': encoder_dim}
+
+ self.time_reduction_layer = time_reduction_layer(
+ **time_reduction_layer_args)
+ self.time_recover_layer = Linear(encoder_dim, encoder_dim)
+ self.final_proj = None
+ if output_size != encoder_dim:
+ self.final_proj = Linear(encoder_dim, output_size)
+
+ def output_size(self) -> int:
+ return self._output_size
+
+ def forward(
+ self,
+ xs: paddle.Tensor,
+ xs_lens: paddle.Tensor,
+ decoding_chunk_size: int=0,
+ num_decoding_left_chunks: int=-1,
+ ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+ """Embed positions in tensor.
+ Args:
+ xs: padded input tensor (B, L, D)
+ xs_lens: input length (B)
+ decoding_chunk_size: decoding chunk size for dynamic chunk
+ 0: default for training, use random dynamic chunk.
+ <0: for decoding, use full chunk.
+ >0: for decoding, use fixed chunk size as set.
+ num_decoding_left_chunks: number of left chunks, this is for decoding,
+ the chunk size is decoding_chunk_size.
+ >=0: use num_decoding_left_chunks
+ <0: use all left chunks
+ Returns:
+ encoder output tensor, lens and mask
+ """
+ masks = make_non_pad_mask(xs_lens).unsqueeze(1) # (B, 1, L)
+
+ if self.global_cmvn is not None:
+ xs = self.global_cmvn(xs)
+ xs, pos_emb, masks = self.embed(xs, masks)
+ mask_pad = masks
+ chunk_masks = add_optional_chunk_mask(
+ xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk,
+ decoding_chunk_size, self.static_chunk_size,
+ num_decoding_left_chunks)
+ xs_lens = chunk_masks.squeeze(1).sum(1)
+ xs = self.preln(xs)
+ recover_activations: \
+ List[Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]] = []
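+        # snapshot (xs, chunk_masks, pos_emb, mask_pad) before every time-reduction
+        # layer so the recover layers can restore the original frame rate later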
+ index = 0
+ for i, layer in enumerate(self.encoders):
+ if self.reduce_idx is not None:
+ if self.time_reduce is not None and i in self.reduce_idx:
+ recover_activations.append(
+ (xs, chunk_masks, pos_emb, mask_pad))
+ xs, xs_lens, chunk_masks, mask_pad = self.time_reduction_layer(
+ xs, xs_lens, chunk_masks, mask_pad)
+ pos_emb = pos_emb[:, ::2, :]
+ index += 1
+
+ if self.recover_idx is not None:
+ if self.time_reduce == 'recover' and i in self.recover_idx:
+ index -= 1
+ recover_tensor, recover_chunk_masks, recover_pos_emb, recover_mask_pad = recover_activations[
+ index]
+ # recover output length for ctc decode
+ xs = paddle.repeat_interleave(xs, repeats=2, axis=1)
+ xs = self.time_recover_layer(xs)
+ recoverd_t = recover_tensor.shape[1]
+ xs = recover_tensor + xs[:, :recoverd_t, :]
+ chunk_masks = recover_chunk_masks
+ pos_emb = recover_pos_emb
+ mask_pad = recover_mask_pad
+
+ xs, chunk_masks, _, _ = layer(xs, chunk_masks, pos_emb, mask_pad)
+
+ if self.final_proj is not None:
+ xs = self.final_proj(xs)
+ return xs, masks
+
+ def check_ascending_list(self):
+ if self.reduce_idx is not None:
+ assert self.reduce_idx == sorted(self.reduce_idx), \
+ "reduce_idx should be int or ascending list"
+ if self.recover_idx is not None:
+ assert self.recover_idx == sorted(self.recover_idx), \
+ "recover_idx should be int or ascending list"
+
+ def calculate_downsampling_factor(self, i: int) -> int:
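+        # net temporal downsampling in effect before layer i:
+        # 2 ** (#reduce layers passed - #recover layers passed); used to stride
+        # the attention cache in forward_chunk so cached frames line up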
+ if self.reduce_idx is None:
+ return 1
+ else:
+ reduce_exp, recover_exp = 0, 0
+ for exp, rd_idx in enumerate(self.reduce_idx):
+ if i >= rd_idx:
+ reduce_exp = exp + 1
+ if self.recover_idx is not None:
+ for exp, rc_idx in enumerate(self.recover_idx):
+ if i >= rc_idx:
+ recover_exp = exp + 1
+ return int(2**(reduce_exp - recover_exp))
+
+ def forward_chunk(
+ self,
+ xs: paddle.Tensor,
+ offset: int,
+ required_cache_size: int,
+ att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+ cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+ att_mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+ ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+ """ Forward just one chunk
+
+ Args:
+ xs (paddle.Tensor): chunk input, with shape (b=1, time, mel-dim),
+ where `time == (chunk_size - 1) * subsample_rate + \
+ subsample.right_context + 1`
+ offset (int): current offset in encoder output time stamp
+ required_cache_size (int): cache size required for next chunk
+                computation
+ >=0: actual cache size
+ <0: means all history cache is required
+ att_cache (paddle.Tensor): cache tensor for KEY & VALUE in
+ transformer/conformer attention, with shape
+ (elayers, head, cache_t1, d_k * 2), where
+ `head * d_k == hidden-dim` and
+ `cache_t1 == chunk_size * num_decoding_left_chunks`.
+ cnn_cache (paddle.Tensor): cache tensor for cnn_module in conformer,
+ (elayers, b=1, hidden-dim, cache_t2), where
+ `cache_t2 == cnn.lorder - 1`
+
+ Returns:
+ paddle.Tensor: output of current input xs,
+ with shape (b=1, chunk_size, hidden-dim).
+ paddle.Tensor: new attention cache required for next chunk, with
+ dynamic shape (elayers, head, ?, d_k * 2)
+ depending on required_cache_size.
+ paddle.Tensor: new conformer cnn cache required for next chunk, with
+ same shape as the original cnn_cache.
+ """
+ assert xs.shape[0] == 1 # batch size must be one
+
+ if self.global_cmvn is not None:
+ xs = self.global_cmvn(xs)
+
+ # tmp_masks is just for interface compatibility, [B=1, C=1, T]
+ tmp_masks = paddle.ones([1, 1, xs.shape[1]], dtype=paddle.bool)
+ # before embed, xs=(B, T, D1), pos_emb=(B=1, T, D)
+ xs, pos_emb, _ = self.embed(xs, tmp_masks, offset=offset)
+
+ # NOTE(xcsong): After embed, shape(xs) is (b=1, chunk_size, hidden-dim)
+ elayers, cache_t1 = att_cache.shape[0], att_cache.shape[2]
+ chunk_size = xs.shape[1]
+ attention_key_size = cache_t1 + chunk_size
+ pos_emb = self.embed.position_encoding(
+ offset=offset - cache_t1, size=attention_key_size)
+ if required_cache_size < 0:
+ next_cache_start = 0
+ elif required_cache_size == 0:
+ next_cache_start = attention_key_size
+ else:
+ next_cache_start = max(attention_key_size - required_cache_size, 0)
+
+ r_att_cache = []
+ r_cnn_cache = []
+
+ mask_pad = paddle.ones([1, xs.shape[1]], dtype=paddle.bool)
+ mask_pad = mask_pad.unsqueeze(1)
+ max_att_len: int = 0
+ recover_activations: \
+ List[Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]] = []
+ index = 0
+ xs_lens = paddle.to_tensor([xs.shape[1]], dtype=paddle.int32)
+ xs = self.preln(xs)
+ for i, layer in enumerate(self.encoders):
+ # NOTE(xcsong): Before layer.forward
+ # shape(att_cache[i:i + 1]) is (1, head, cache_t1, d_k * 2),
+ # shape(cnn_cache[i]) is (b=1, hidden-dim, cache_t2)
+ if self.reduce_idx is not None:
+ if self.time_reduce is not None and i in self.reduce_idx:
+ recover_activations.append(
+ (xs, att_mask, pos_emb, mask_pad))
+ xs, xs_lens, att_mask, mask_pad = self.time_reduction_layer(
+ xs, xs_lens, att_mask, mask_pad)
+ pos_emb = pos_emb[:, ::2, :]
+ index += 1
+
+ if self.recover_idx is not None:
+ if self.time_reduce == 'recover' and i in self.recover_idx:
+ index -= 1
+ recover_tensor, recover_att_mask, recover_pos_emb, recover_mask_pad = recover_activations[
+ index]
+ # recover output length for ctc decode
+ xs = paddle.repeat_interleave(xs, repeats=2, axis=1)
+ xs = self.time_recover_layer(xs)
+ recoverd_t = recover_tensor.shape[1]
+ xs = recover_tensor + xs[:, :recoverd_t, :]
+ att_mask = recover_att_mask
+ pos_emb = recover_pos_emb
+ mask_pad = recover_mask_pad
+
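+            # stride the cached keys/values by the downsampling factor of this
+            # layer and crop them so cache length + chunk length matches the
+            # positional-embedding length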
+ factor = self.calculate_downsampling_factor(i)
+ att_cache1 = att_cache[
+ i:i + 1][:, :, ::factor, :][:, :, :pos_emb.shape[1] - xs.shape[
+ 1], :]
+ cnn_cache1 = cnn_cache[i] if cnn_cache.shape[0] > 0 else cnn_cache
+ xs, _, new_att_cache, new_cnn_cache = layer(
+ xs,
+ att_mask,
+ pos_emb,
+ att_cache=att_cache1,
+ cnn_cache=cnn_cache1)
+ # NOTE(xcsong): After layer.forward
+ # shape(new_att_cache) is (1, head, attention_key_size, d_k * 2),
+ # shape(new_cnn_cache) is (b=1, hidden-dim, cache_t2)
+ cached_att = new_att_cache[:, :, next_cache_start // factor:, :]
+ cached_cnn = new_cnn_cache.unsqueeze(0)
+ cached_att = cached_att.repeat_interleave(repeats=factor, axis=2)
+ if i == 0:
+ # record length for the first block as max length
+ max_att_len = cached_att.shape[2]
+ r_att_cache.append(cached_att[:, :, :max_att_len, :])
+ r_cnn_cache.append(cached_cnn)
+ # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2),
+ # ? may be larger than cache_t1, it depends on required_cache_size
+ r_att_cache = paddle.concat(r_att_cache, axis=0)
+ # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2)
+ r_cnn_cache = paddle.concat(r_cnn_cache, axis=0)
+
+ if self.final_proj is not None:
+ xs = self.final_proj(xs)
+ return xs, r_att_cache, r_cnn_cache
diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py
index dac62bce..0499e742 100644
--- a/paddlespeech/s2t/modules/encoder_layer.py
+++ b/paddlespeech/s2t/modules/encoder_layer.py
@@ -26,7 +26,10 @@ from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
-__all__ = ["TransformerEncoderLayer", "ConformerEncoderLayer"]
+__all__ = [
+ "TransformerEncoderLayer", "ConformerEncoderLayer",
+ "SqueezeformerEncoderLayer"
+]
class TransformerEncoderLayer(nn.Layer):
@@ -45,7 +48,7 @@ class TransformerEncoderLayer(nn.Layer):
Args:
size (int): Input dimension.
self_attn (nn.Layer): Self-attention module instance.
- `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
+ `MultiHeadedAttention`, `RelPositionMultiHeadedAttention` or `RoPERelPositionMultiHeadedAttention`
instance can be used as the argument.
feed_forward (nn.Layer): Feed-forward module instance.
`PositionwiseFeedForward`, instance can be used as the argument.
@@ -144,7 +147,7 @@ class ConformerEncoderLayer(nn.Layer):
Args:
size (int): Input dimension.
self_attn (nn.Layer): Self-attention module instance.
- `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
+ `MultiHeadedAttention`, `RelPositionMultiHeadedAttention` or `RoPERelPositionMultiHeadedAttention`
instance can be used as the argument.
feed_forward (nn.Layer): Feed-forward module instance.
`PositionwiseFeedForward` instance can be used as the argument.
@@ -276,3 +279,125 @@ class ConformerEncoderLayer(nn.Layer):
x = self.norm_final(x)
return x, mask, new_att_cache, new_cnn_cache
+
+
+class SqueezeformerEncoderLayer(nn.Layer):
+ """Encoder layer module."""
+
+ def __init__(self,
+ size: int,
+ self_attn: paddle.nn.Layer,
+ feed_forward1: Optional[nn.Layer]=None,
+ conv_module: Optional[nn.Layer]=None,
+ feed_forward2: Optional[nn.Layer]=None,
+ normalize_before: bool=False,
+ dropout_rate: float=0.1,
+ concat_after: bool=False):
+ """Construct an EncoderLayer object.
+
+ Args:
+ size (int): Input dimension.
+ self_attn (paddle.nn.Layer): Self-attention module instance.
+ `MultiHeadedAttention`, `RelPositionMultiHeadedAttention` or `RoPERelPositionMultiHeadedAttention`
+ instance can be used as the argument.
+ feed_forward1 (paddle.nn.Layer): Feed-forward module instance.
+ `PositionwiseFeedForward` instance can be used as the argument.
+ conv_module (paddle.nn.Layer): Convolution module instance.
+                `ConvolutionModule` instance can be used as the argument.
+ feed_forward2 (paddle.nn.Layer): Feed-forward module instance.
+ `PositionwiseFeedForward` instance can be used as the argument.
+ dropout_rate (float): Dropout rate.
+ normalize_before (bool):
+ True: use layer_norm before each sub-block.
+ False: use layer_norm after each sub-block.
+ """
+ super().__init__()
+ self.size = size
+ self.self_attn = self_attn
+ self.layer_norm1 = LayerNorm(size)
+ self.ffn1 = feed_forward1
+ self.layer_norm2 = LayerNorm(size)
+ self.conv_module = conv_module
+ self.layer_norm3 = LayerNorm(size)
+ self.ffn2 = feed_forward2
+ self.layer_norm4 = LayerNorm(size)
+ self.normalize_before = normalize_before
+ self.dropout = nn.Dropout(dropout_rate)
+ self.concat_after = concat_after
+ if concat_after:
+ self.concat_linear = Linear(size + size, size)
+ else:
+ self.concat_linear = nn.Identity()
+
+ def forward(
+ self,
+ x: paddle.Tensor,
+ mask: paddle.Tensor,
+ pos_emb: paddle.Tensor,
+ mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+ att_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+ cnn_cache: paddle.Tensor=paddle.zeros([0, 0, 0, 0]),
+ ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+ """Compute encoded features.
+ Args:
+ x (paddle.Tensor): Input tensor (#batch, time, size).
+ mask (paddle.Tensor): Mask tensor for the input (#batch, time, time).
+ (0,0,0) means fake mask.
+ pos_emb (paddle.Tensor): postional encoding, must not be None
+ for ConformerEncoderLayer
+ mask_pad (paddle.Tensor): batch padding mask used for conv module.
+ (#batch, 1,time), (0, 0, 0) means fake mask.
+ att_cache (paddle.Tensor): Cache tensor of the KEY & VALUE
+ (#batch=1, head, cache_t1, d_k * 2), head * d_k == size.
+ cnn_cache (paddle.Tensor): Convolution cache in conformer layer
+ (1, #batch=1, size, cache_t2). First dim will not be used, just
+ for dy2st.
+ Returns:
+ paddle.Tensor: Output tensor (#batch, time, size).
+ paddle.Tensor: Mask tensor (#batch, time, time).
+ paddle.Tensor: att_cache tensor,
+ (#batch=1, head, cache_t1 + time, d_k * 2).
+            paddle.Tensor: cnn_cache tensor (#batch, size, cache_t2).
+ """
+ # self attention module
+ residual = x
+ if self.normalize_before:
+ x = self.layer_norm1(x)
+ x_att, new_att_cache = self.self_attn(x, x, x, mask, pos_emb, att_cache)
+ if self.concat_after:
+ x_concat = paddle.concat((x, x_att), axis=-1)
+ x = residual + self.concat_linear(x_concat)
+ else:
+ x = residual + self.dropout(x_att)
+ if not self.normalize_before:
+ x = self.layer_norm1(x)
+
+ # ffn module
+ residual = x
+ if self.normalize_before:
+ x = self.layer_norm2(x)
+ x = self.ffn1(x)
+ x = residual + self.dropout(x)
+ if not self.normalize_before:
+ x = self.layer_norm2(x)
+
+ # conv module
+ residual = x
+ if self.normalize_before:
+ x = self.layer_norm3(x)
+ x, new_cnn_cache = self.conv_module(x, mask_pad, cnn_cache)
+ x = residual + self.dropout(x)
+ if not self.normalize_before:
+ x = self.layer_norm3(x)
+
+ # ffn module
+ residual = x
+ if self.normalize_before:
+ x = self.layer_norm4(x)
+ x = self.ffn2(x)
+        # dropout is applied to the ffn output before the residual connection
+ x = residual + self.dropout(x)
+ if not self.normalize_before:
+ x = self.layer_norm4(x)
+
+ return x, mask, new_att_cache, new_cnn_cache
diff --git a/paddlespeech/s2t/modules/positionwise_feed_forward.py b/paddlespeech/s2t/modules/positionwise_feed_forward.py
index c2725dc5..9ebd5d63 100644
--- a/paddlespeech/s2t/modules/positionwise_feed_forward.py
+++ b/paddlespeech/s2t/modules/positionwise_feed_forward.py
@@ -16,6 +16,7 @@
"""Positionwise feed forward layer definition."""
import paddle
from paddle import nn
+from paddle.nn import initializer as I
from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.utils.log import Log
@@ -32,7 +33,9 @@ class PositionwiseFeedForward(nn.Layer):
idim: int,
hidden_units: int,
dropout_rate: float,
- activation: nn.Layer=nn.ReLU()):
+ activation: nn.Layer=nn.ReLU(),
+ adaptive_scale: bool=False,
+ init_weights: bool=False):
"""Construct a PositionwiseFeedForward object.
         FeedForward is applied on each position of the sequence.
@@ -45,10 +48,35 @@ class PositionwiseFeedForward(nn.Layer):
activation (paddle.nn.Layer): Activation function
"""
super().__init__()
+ self.idim = idim
+ self.hidden_units = hidden_units
self.w_1 = Linear(idim, hidden_units)
self.activation = activation
self.dropout = nn.Dropout(dropout_rate)
self.w_2 = Linear(hidden_units, idim)
+ self.adaptive_scale = adaptive_scale
+ if self.adaptive_scale:
+ ada_scale = self.create_parameter(
+ [1, 1, idim], default_initializer=I.XavierUniform())
+ self.add_parameter('ada_scale', ada_scale)
+ ada_bias = self.create_parameter(
+ [1, 1, idim], default_initializer=I.XavierUniform())
+ self.add_parameter('ada_bias', ada_bias)
+
+ if init_weights:
+ self.init_weights()
+
+ def init_weights(self):
+ ffn1_max = self.idim**-0.5
+ ffn2_max = self.hidden_units**-0.5
+ self.w_1._param_attr = paddle.nn.initializer.Uniform(
+ low=-ffn1_max, high=ffn1_max)
+ self.w_1._bias_attr = paddle.nn.initializer.Uniform(
+ low=-ffn1_max, high=ffn1_max)
+ self.w_2._param_attr = paddle.nn.initializer.Uniform(
+ low=-ffn2_max, high=ffn2_max)
+ self.w_2._bias_attr = paddle.nn.initializer.Uniform(
+ low=-ffn2_max, high=ffn2_max)
def forward(self, xs: paddle.Tensor) -> paddle.Tensor:
"""Forward function.
@@ -57,4 +85,6 @@ class PositionwiseFeedForward(nn.Layer):
Returns:
output tensor, (B, Lmax, D)
"""
+ if self.adaptive_scale:
+ xs = self.ada_scale * xs + self.ada_bias
return self.w_2(self.dropout(self.activation(self.w_1(xs))))
diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py
index 782a437e..ef60bdf0 100644
--- a/paddlespeech/s2t/modules/subsampling.py
+++ b/paddlespeech/s2t/modules/subsampling.py
@@ -29,7 +29,7 @@ logger = Log(__name__).getlog()
__all__ = [
"LinearNoSubsampling", "Conv2dSubsampling4", "Conv2dSubsampling6",
- "Conv2dSubsampling8"
+ "Conv2dSubsampling8", "DepthwiseConv2DSubsampling4"
]
@@ -249,3 +249,67 @@ class Conv2dSubsampling8(Conv2dSubsampling):
x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, -1, c * f]))
x, pos_emb = self.pos_enc(x, offset)
return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2]
+
+
+class DepthwiseConv2DSubsampling4(BaseSubsampling):
+ """Depthwise Convolutional 2D subsampling (to 1/4 length).
+
+ Args:
+ idim (int): Input dimension.
+ odim (int): Output dimension.
+ pos_enc_class (nn.Layer): position encoding class.
+        dw_stride (bool): Whether to use depthwise convolution.
+ input_size (int): filter bank dimension.
+
+ """
+
+ def __init__(self,
+ idim: int,
+ odim: int,
+ pos_enc_class: nn.Layer,
+ dw_stride: bool=False,
+ input_size: int=80,
+ input_dropout_rate: float=0.1,
+ init_weights: bool=True):
+ super(DepthwiseConv2DSubsampling4, self).__init__()
+ self.idim = idim
+ self.odim = odim
+ self.pw_conv = Conv2D(
+ in_channels=idim, out_channels=odim, kernel_size=3, stride=2)
+ self.act1 = nn.ReLU()
+ self.dw_conv = Conv2D(
+ in_channels=odim,
+ out_channels=odim,
+ kernel_size=3,
+ stride=2,
+ groups=odim if dw_stride else 1)
+ self.act2 = nn.ReLU()
+ self.pos_enc = pos_enc_class
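+        # two stride-2, kernel-3 convs shrink the frequency axis: f -> ((f - 1) // 2 - 1) // 2, hence the Linear in-dim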
+ self.input_proj = nn.Sequential(
+ Linear(odim * (((input_size - 1) // 2 - 1) // 2), odim),
+ nn.Dropout(p=input_dropout_rate))
+ if init_weights:
+ linear_max = (odim * input_size / 4)**-0.5
+ self.input_proj.state_dict()[
+ '0.weight'] = paddle.nn.initializer.Uniform(
+ low=-linear_max, high=linear_max)
+ self.input_proj.state_dict()[
+ '0.bias'] = paddle.nn.initializer.Uniform(
+ low=-linear_max, high=linear_max)
+
+ self.subsampling_rate = 4
+ # 6 = (3 - 1) * 1 + (3 - 1) * 2
+ self.right_context = 6
+
+ def forward(self, x: paddle.Tensor, x_mask: paddle.Tensor, offset: int=0
+ ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+ x = x.unsqueeze(1) # (b, c=1, t, f)
+ x = self.pw_conv(x)
+ x = self.act1(x)
+ x = self.dw_conv(x)
+ x = self.act2(x)
+ b, c, t, f = x.shape
+ x = x.transpose([0, 2, 1, 3]).reshape([b, -1, c * f])
+ x, pos_emb = self.pos_enc(x, offset)
+ x = self.input_proj(x)
+ return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2]
diff --git a/paddlespeech/s2t/modules/time_reduction.py b/paddlespeech/s2t/modules/time_reduction.py
new file mode 100644
index 00000000..d3393f10
--- /dev/null
+++ b/paddlespeech/s2t/modules/time_reduction.py
@@ -0,0 +1,263 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from wenet(https://github.com/wenet-e2e/wenet)
+"""Subsampling layer definition."""
+from typing import Tuple
+
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+
+from paddlespeech.s2t import masked_fill
+from paddlespeech.s2t.modules.align import Conv1D
+from paddlespeech.s2t.modules.conv2d import Conv2DValid
+from paddlespeech.s2t.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+__all__ = [
+ "TimeReductionLayerStream", "TimeReductionLayer1D", "TimeReductionLayer2D"
+]
+
+
+class TimeReductionLayer1D(nn.Layer):
+ """
+    Modified from NeMo,
+ Squeezeformer Time Reduction procedure.
+ Downsamples the audio by `stride` in the time dimension.
+ Args:
+ channel (int): input dimension of
+ MultiheadAttentionMechanism and PositionwiseFeedForward
+ out_dim (int): Output dimension of the module.
+ kernel_size (int): Conv kernel size for
+ depthwise convolution in convolution module
+ stride (int): Downsampling factor in time dimension.
+ """
+
+ def __init__(self,
+ channel: int,
+ out_dim: int,
+ kernel_size: int=5,
+ stride: int=2):
+ super(TimeReductionLayer1D, self).__init__()
+
+ self.channel = channel
+ self.out_dim = out_dim
+ self.kernel_size = kernel_size
+ self.stride = stride
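+        # pad so the stride-2 depthwise conv yields at least ceil(T / stride) frames; extras are trimmed in forward()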
+ self.padding = max(0, self.kernel_size - self.stride)
+
+ self.dw_conv = Conv1D(
+ in_channels=channel,
+ out_channels=channel,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=self.padding,
+ groups=channel, )
+
+ self.pw_conv = Conv1D(
+ in_channels=channel,
+ out_channels=out_dim,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ groups=1, )
+
+ self.init_weights()
+
+ def init_weights(self):
+ dw_max = self.kernel_size**-0.5
+ pw_max = self.channel**-0.5
+ self.dw_conv._param_attr = paddle.nn.initializer.Uniform(
+ low=-dw_max, high=dw_max)
+ self.dw_conv._bias_attr = paddle.nn.initializer.Uniform(
+ low=-dw_max, high=dw_max)
+ self.pw_conv._param_attr = paddle.nn.initializer.Uniform(
+ low=-pw_max, high=pw_max)
+ self.pw_conv._bias_attr = paddle.nn.initializer.Uniform(
+ low=-pw_max, high=pw_max)
+
+ def forward(
+ self,
+ xs,
+ xs_lens: paddle.Tensor,
+ mask: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool),
+ mask_pad: paddle.Tensor=paddle.ones((0, 0, 0),
+ dtype=paddle.bool), ):
+ xs = xs.transpose([0, 2, 1]) # [B, C, T]
+ xs = masked_fill(xs, mask_pad.equal(0), 0.0)
+
+ xs = self.dw_conv(xs)
+ xs = self.pw_conv(xs)
+
+ xs = xs.transpose([0, 2, 1]) # [B, T, C]
+
+ B, T, D = xs.shape
+ mask = mask[:, ::self.stride, ::self.stride]
+ mask_pad = mask_pad[:, :, ::self.stride]
+ L = mask_pad.shape[-1]
+ # For JIT exporting, we remove F.pad operator.
+ if L - T < 0:
+ xs = xs[:, :L - T, :]
+ else:
+ dummy_pad = paddle.zeros([B, L - T, D], dtype=paddle.float32)
+ xs = paddle.concat([xs, dummy_pad], axis=1)
+
+ xs_lens = (xs_lens + 1) // 2
+ return xs, xs_lens, mask, mask_pad
+
+
+class TimeReductionLayer2D(nn.Layer):
+ def __init__(self, kernel_size: int=5, stride: int=2, encoder_dim: int=256):
+ super(TimeReductionLayer2D, self).__init__()
+ self.encoder_dim = encoder_dim
+ self.kernel_size = kernel_size
+ self.dw_conv = Conv2DValid(
+ in_channels=encoder_dim,
+ out_channels=encoder_dim,
+ kernel_size=(kernel_size, 1),
+ stride=stride,
+ valid_trigy=True)
+ self.pw_conv = Conv2DValid(
+ in_channels=encoder_dim,
+ out_channels=encoder_dim,
+ kernel_size=1,
+ stride=1,
+ valid_trigx=False,
+ valid_trigy=False)
+
+ self.kernel_size = kernel_size
+ self.stride = stride
+ self.init_weights()
+
+ def init_weights(self):
+ dw_max = self.kernel_size**-0.5
+ pw_max = self.encoder_dim**-0.5
+ self.dw_conv._param_attr = paddle.nn.initializer.Uniform(
+ low=-dw_max, high=dw_max)
+ self.dw_conv._bias_attr = paddle.nn.initializer.Uniform(
+ low=-dw_max, high=dw_max)
+ self.pw_conv._param_attr = paddle.nn.initializer.Uniform(
+ low=-pw_max, high=pw_max)
+ self.pw_conv._bias_attr = paddle.nn.initializer.Uniform(
+ low=-pw_max, high=pw_max)
+
+ def forward(
+ self,
+ xs: paddle.Tensor,
+ xs_lens: paddle.Tensor,
+ mask: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool),
+ mask_pad: paddle.Tensor=paddle.ones((0, 0, 0), dtype=paddle.bool),
+ ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+ xs = masked_fill(xs, mask_pad.transpose([0, 2, 1]).equal(0), 0.0)
+ xs = xs.unsqueeze(1)
+ padding1 = self.kernel_size - self.stride
+ xs = F.pad(
+ xs, (0, 0, 0, 0, 0, padding1, 0, 0), mode='constant', value=0.)
+ xs = self.dw_conv(xs.transpose([0, 3, 2, 1]))
+ xs = self.pw_conv(xs).transpose([0, 3, 2, 1]).squeeze(1)
+ tmp_length = xs.shape[1]
+ xs_lens = (xs_lens + 1) // 2
+ padding2 = max(0, (xs_lens.max() - tmp_length).item())
+ batch_size, hidden = xs.shape[0], xs.shape[-1]
+ dummy_pad = paddle.zeros(
+ [batch_size, padding2, hidden], dtype=paddle.float32)
+ xs = paddle.concat([xs, dummy_pad], axis=1)
+ mask = mask[:, ::2, ::2]
+ mask_pad = mask_pad[:, :, ::2]
+ return xs, xs_lens, mask, mask_pad
+
+
+class TimeReductionLayerStream(nn.Layer):
+ """
+ Squeezeformer Time Reduction procedure.
+ Downsamples the audio by `stride` in the time dimension.
+ Args:
+ channel (int): input dimension of
+ MultiheadAttentionMechanism and PositionwiseFeedForward
+ out_dim (int): Output dimension of the module.
+ kernel_size (int): Conv kernel size for
+ depthwise convolution in convolution module
+ stride (int): Downsampling factor in time dimension.
+ """
+
+ def __init__(self,
+ channel: int,
+ out_dim: int,
+ kernel_size: int=1,
+ stride: int=2):
+ super(TimeReductionLayerStream, self).__init__()
+
+ self.channel = channel
+ self.out_dim = out_dim
+ self.kernel_size = kernel_size
+ self.stride = stride
+
+ self.dw_conv = Conv1D(
+ in_channels=channel,
+ out_channels=channel,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=0,
+ groups=channel)
+
+ self.pw_conv = Conv1D(
+ in_channels=channel,
+ out_channels=out_dim,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ groups=1)
+ self.init_weights()
+
+ def init_weights(self):
+ dw_max = self.kernel_size**-0.5
+ pw_max = self.channel**-0.5
+ self.dw_conv._param_attr = paddle.nn.initializer.Uniform(
+ low=-dw_max, high=dw_max)
+ self.dw_conv._bias_attr = paddle.nn.initializer.Uniform(
+ low=-dw_max, high=dw_max)
+ self.pw_conv._param_attr = paddle.nn.initializer.Uniform(
+ low=-pw_max, high=pw_max)
+ self.pw_conv._bias_attr = paddle.nn.initializer.Uniform(
+ low=-pw_max, high=pw_max)
+
+ def forward(
+ self,
+ xs,
+ xs_lens: paddle.Tensor,
+ mask: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool),
+ mask_pad: paddle.Tensor=paddle.ones([0, 0, 0], dtype=paddle.bool)):
+ xs = xs.transpose([0, 2, 1]) # [B, C, T]
+ xs = masked_fill(xs, mask_pad.equal(0), 0.0)
+
+ xs = self.dw_conv(xs)
+ xs = self.pw_conv(xs)
+
+ xs = xs.transpose([0, 2, 1]) # [B, T, C]
+
+ B, T, D = xs.shape
+ mask = mask[:, ::self.stride, ::self.stride]
+ mask_pad = mask_pad[:, :, ::self.stride]
+ L = mask_pad.shape[-1]
+ # For JIT exporting, we remove F.pad operator.
+ if L - T < 0:
+ xs = xs[:, :L - T, :]
+ else:
+ dummy_pad = paddle.zeros([B, L - T, D], dtype=paddle.float32)
+ xs = paddle.concat([xs, dummy_pad], axis=1)
+
+ xs_lens = (xs_lens + 1) // 2
+ return xs, xs_lens, mask, mask_pad
diff --git a/paddlespeech/s2t/training/cli.py b/paddlespeech/s2t/training/cli.py
index 1b6bec8a..741b95df 100644
--- a/paddlespeech/s2t/training/cli.py
+++ b/paddlespeech/s2t/training/cli.py
@@ -13,6 +13,9 @@
# limitations under the License.
import argparse
+import distutils.util
+from yacs.config import CfgNode
+
class ExtendAction(argparse.Action):
"""
@@ -68,7 +71,15 @@ def default_argument_parser(parser=None):
parser.register('action', 'extend', ExtendAction)
parser.add_argument(
'--conf', type=open, action=LoadFromFile, help="config file.")
+ parser.add_argument(
+ "--debug",
+ type=distutils.util.strtobool,
+ default=False,
+ help="logging with debug mode.")
+ parser.add_argument(
+ "--dump_path", type=str, default=None, help="path to dump config file.")
+ # train group
train_group = parser.add_argument_group(
title='Train Options', description=None)
train_group.add_argument(
@@ -103,14 +114,35 @@ def default_argument_parser(parser=None):
train_group.add_argument(
"--dump-config", metavar="FILE", help="dump config to `this` file.")
+ # test group
test_group = parser.add_argument_group(
title='Test Options', description=None)
-
test_group.add_argument(
"--decode_cfg",
metavar="DECODE_CONFIG_FILE",
help="decode config file.")
+ test_group.add_argument(
+ "--result_file", type=str, help="path of save the asr result")
+ test_group.add_argument(
+ "--audio_file", type=str, help="path of the input audio file")
+
+ # quant & export
+ quant_group = parser.add_argument_group(
+ title='Quant Options', description=None)
+ quant_group.add_argument(
+ "--audio_scp", type=str, help="path of the input audio scp file")
+ quant_group.add_argument(
+ "--num_utts",
+ type=int,
+ default=200,
+ help="num utts for quant calibrition.")
+ quant_group.add_argument(
+ "--export_path",
+ type=str,
+ default='export.jit.quant',
+ help="path of the jit model to save")
+ # profile group
profile_group = parser.add_argument_group(
title='Benchmark Options', description=None)
profile_group.add_argument(
@@ -131,3 +163,28 @@ def default_argument_parser(parser=None):
help='max iteration for benchmark.')
return parser
+
+
+def config_from_args(args):
+ # https://yaml.org/type/float.html
+ config = CfgNode(new_allowed=True)
+
+ if args.config:
+ config.merge_from_file(args.config)
+
+ if args.decode_cfg:
+ decode_confs = CfgNode(new_allowed=True)
+ decode_confs.merge_from_file(args.decode_cfg)
+ config.decode = decode_confs
+
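+    # command-line --opts entries are merged last, so they override values from the config files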
+ if args.opts:
+ config.merge_from_list(args.opts)
+ config.freeze()
+ return config
+
+
+def maybe_dump_config(dump_path, config):
+ if dump_path:
+ with open(dump_path, 'w') as f:
+ print(config, file=f)
+ print(f"save config to {dump_path}")
diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py
deleted file mode 100644
index 06587c74..00000000
--- a/paddlespeech/s2t/training/gradclip.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import paddle
-from paddle.fluid import core
-from paddle.fluid import layers
-from paddle.fluid.dygraph import base as imperative_base
-
-from paddlespeech.s2t.utils.log import Log
-
-__all__ = ["ClipGradByGlobalNormWithLog"]
-
-logger = Log(__name__).getlog()
-
-
-class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
- def __init__(self, clip_norm):
- super().__init__(clip_norm)
-
- def __repr__(self):
- return f"{self.__class__.__name__}(global_clip_norm={self.clip_norm})"
-
- @imperative_base.no_grad
- def _dygraph_clip(self, params_grads):
- params_and_grads = []
- sum_square_list = []
- for i, (p, g) in enumerate(params_grads):
- if g is None:
- continue
- if getattr(p, 'need_clip', True) is False:
- continue
- merge_grad = g
- if g.type == core.VarDesc.VarType.SELECTED_ROWS:
- merge_grad = layers.merge_selected_rows(g)
- merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
- square = paddle.square(merge_grad)
- sum_square = paddle.sum(square)
- sum_square_list.append(sum_square)
-
- # debug log, not dump all since slow down train process
- if i < 10:
- logger.debug(
- f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }")
-
- # all parameters have been filterd out
- if len(sum_square_list) == 0:
- return params_grads
-
- global_norm_var = paddle.concat(sum_square_list)
- global_norm_var = paddle.sum(global_norm_var)
- global_norm_var = paddle.sqrt(global_norm_var)
-
- # debug log
- logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!")
-
- max_global_norm = paddle.full(
- shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm)
- clip_var = paddle.divide(
- x=max_global_norm,
- y=paddle.maximum(x=global_norm_var, y=max_global_norm))
- for i, (p, g) in enumerate(params_grads):
- if g is None:
- continue
- if getattr(p, 'need_clip', True) is False:
- params_and_grads.append((p, g))
- continue
- new_grad = paddle.multiply(x=g, y=clip_var)
- params_and_grads.append((p, new_grad))
-
- # debug log, not dump all since slow down train process
- if i < 10:
- logger.debug(
- f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"
- )
-
- return params_and_grads
diff --git a/paddlespeech/s2t/training/optimizer/__init__.py b/paddlespeech/s2t/training/optimizer/__init__.py
index aafdc5b6..90281e1e 100644
--- a/paddlespeech/s2t/training/optimizer/__init__.py
+++ b/paddlespeech/s2t/training/optimizer/__init__.py
@@ -19,7 +19,7 @@ from typing import Text
import paddle
from paddle.optimizer import Optimizer
from paddle.regularizer import L2Decay
-from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
+
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.s2t.utils.dynamic_import import instance_class
from paddlespeech.s2t.utils.log import Log
@@ -100,10 +100,9 @@ class OptimizerFactory():
assert "parameters" in args, "parameters not in args."
assert "learning_rate" in args, "learning_rate not in args."
- grad_clip = ClipGradByGlobalNormWithLog(
+ grad_clip = paddle.nn.ClipGradByGlobalNorm(
args['grad_clip']) if "grad_clip" in args else None
- weight_decay = L2Decay(
- args['weight_decay']) if "weight_decay" in args else None
+ weight_decay = args.get("weight_decay", None)
if weight_decay:
logger.info(f'')
if grad_clip:
diff --git a/paddlespeech/s2t/training/optimizer/adadelta.py b/paddlespeech/s2t/training/optimizer/adadelta.py
index 900b697c..7c3950a9 100644
--- a/paddlespeech/s2t/training/optimizer/adadelta.py
+++ b/paddlespeech/s2t/training/optimizer/adadelta.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
-from paddle.fluid import framework
+from paddle import framework
from paddle.optimizer import Optimizer
__all__ = []
diff --git a/paddlespeech/s2t/training/trainer.py b/paddlespeech/s2t/training/trainer.py
index 4a69d78a..a8f36f91 100644
--- a/paddlespeech/s2t/training/trainer.py
+++ b/paddlespeech/s2t/training/trainer.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import os
import sys
import time
from collections import OrderedDict
@@ -110,6 +111,7 @@ class Trainer():
self.rank = dist.get_rank()
self.world_size = dist.get_world_size()
self._train = True
+ self.scaler = None
# print deps version
all_version()
@@ -187,8 +189,13 @@ class Trainer():
infos.update({
"step": self.iteration,
"epoch": self.epoch,
- "lr": self.optimizer.get_lr()
+ "lr": self.optimizer.get_lr(),
})
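+        # also save the AMP GradScaler state so mixed-precision training can resume consistently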
+ if self.scaler:
+ scaler_path = os.path.join(self.checkpoint_dir,
+ "{}".format(self.epoch)) + '.scaler'
+ paddle.save(self.scaler.state_dict(), scaler_path)
+
self.checkpoint.save_parameters(self.checkpoint_dir, self.iteration
if tag is None else tag, self.model,
self.optimizer, infos)
@@ -211,6 +218,13 @@ class Trainer():
         # lr will restore from optimizer ckpt
self.iteration = infos["step"]
self.epoch = infos["epoch"]
+
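+        # restore the AMP GradScaler state if it was saved alongside this checkpoint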
+ scaler_path = os.path.join(self.checkpoint_dir,
+ "{}".format(self.epoch)) + '.scaler'
+ if os.path.exists(scaler_path):
+ scaler_state_dict = paddle.load(scaler_path)
+ self.scaler.load_state_dict(scaler_state_dict)
+
scratch = False
logger.info(
f"Restore ckpt: epoch {self.epoch }, step {self.iteration}!")
diff --git a/paddlespeech/s2t/utils/utility.py b/paddlespeech/s2t/utils/utility.py
index fdd8c029..5655ec3f 100644
--- a/paddlespeech/s2t/utils/utility.py
+++ b/paddlespeech/s2t/utils/utility.py
@@ -29,10 +29,7 @@ from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
-__all__ = [
- "all_version", "UpdateConfig", "seed_all", 'print_arguments',
- 'add_arguments', "log_add"
-]
+__all__ = ["all_version", "UpdateConfig", "seed_all", "log_add"]
def all_version():
@@ -60,51 +57,6 @@ def seed_all(seed: int=20210329):
paddle.seed(seed)
-def print_arguments(args, info=None):
- """Print argparse's arguments.
-
- Usage:
-
- .. code-block:: python
-
- parser = argparse.ArgumentParser()
- parser.add_argument("name", default="Jonh", type=str, help="User name.")
- args = parser.parse_args()
- print_arguments(args)
-
- :param args: Input argparse.Namespace for printing.
- :type args: argparse.Namespace
- """
- filename = ""
- if info:
- filename = info["__file__"]
- filename = os.path.basename(filename)
- print(f"----------- {filename} Arguments -----------")
- for arg, value in sorted(vars(args).items()):
- print("%s: %s" % (arg, value))
- print("-----------------------------------------------------------")
-
-
-def add_arguments(argname, type, default, help, argparser, **kwargs):
- """Add argparse's argument.
-
- Usage:
-
- .. code-block:: python
-
- parser = argparse.ArgumentParser()
- add_argument("name", str, "Jonh", "User name.", parser)
- args = parser.parse_args()
- """
- type = distutils.util.strtobool if type == bool else type
- argparser.add_argument(
- "--" + argname,
- default=default,
- type=type,
- help=help + ' Default: %(default)s.',
- **kwargs)
-
-
def log_add(args: List[int]) -> float:
"""Stable log add
@@ -130,8 +82,11 @@ def get_subsample(config):
Returns:
int: subsample rate.
"""
- input_layer = config["encoder_conf"]["input_layer"]
- assert input_layer in ["conv2d", "conv2d6", "conv2d8"]
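+    # squeezeformer's conv front-end always subsamples by a factor of 4, so no config lookup is needed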
+ if config['encoder'] == 'squeezeformer':
+ return 4
+ else:
+ input_layer = config["encoder_conf"]["input_layer"]
+ assert input_layer in ["conv2d", "conv2d6", "conv2d8"]
if input_layer == "conv2d":
return 4
elif input_layer == "conv2d6":
diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py
index 536ffe0a..a702f0aa 100644
--- a/paddlespeech/server/engine/asr/online/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py
@@ -609,7 +609,7 @@ class PaddleASRConnectionHanddler:
dtype=paddle.long) # (beam_size,)
hyps_pad, _ = add_sos_eos(hyps_pad, self.model.sos, self.model.eos,
self.model.ignore_id)
- hyps_lens = hyps_lens + 1 # Add at begining
+ hyps_lens = hyps_lens + 1 # Add at beginning
# ctc score in ln domain
# (beam_size, max_hyps_len, vocab_size)
diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py
index e297e5c2..7f81f03b 100644
--- a/paddlespeech/server/engine/asr/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/python/asr_engine.py
@@ -67,13 +67,19 @@ class ASREngine(BaseEngine):
logger.error(e)
return False
+        cs = False
+
+        if self.config.lang == "zh_en":
+            cs = True
+
self.executor._init_from_path(
model_type=self.config.model,
lang=self.config.lang,
sample_rate=self.config.sample_rate,
cfg_path=self.config.cfg_path,
decode_method=self.config.decode_method,
- ckpt_path=self.config.ckpt_path)
+ ckpt_path=self.config.ckpt_path,
+            codeswitch=cs)
logger.info("Initialize ASR server engine successfully on device: %s." %
(self.device))
diff --git a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
index 0995a55d..14204dde 100644
--- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
@@ -28,7 +28,7 @@ from paddlespeech.server.utils.audio_process import float2pcm
from paddlespeech.server.utils.onnx_infer import get_sess
from paddlespeech.server.utils.util import denorm
from paddlespeech.server.utils.util import get_chunks
-from paddlespeech.t2s.frontend import English
+from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
__all__ = ['TTSEngine', 'PaddleTTSConnectionHandler']
@@ -154,7 +154,7 @@ class TTSServerExecutor(TTSExecutor):
self.voc_sess = get_sess(self.voc_ckpt, voc_sess_conf)
logger.debug("Create voc sess successfully.")
- with open(self.phones_dict, "r") as f:
+ with open(self.phones_dict, "r", encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
self.vocab_size = len(phn_id)
logger.debug(f"vocab_size: {self.vocab_size}")
diff --git a/paddlespeech/server/engine/tts/online/python/tts_engine.py b/paddlespeech/server/engine/tts/online/python/tts_engine.py
index a46b84bd..0cfb2035 100644
--- a/paddlespeech/server/engine/tts/online/python/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py
@@ -29,7 +29,7 @@ from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.audio_process import float2pcm
from paddlespeech.server.utils.util import denorm
from paddlespeech.server.utils.util import get_chunks
-from paddlespeech.t2s.frontend import English
+from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.modules.normalizer import ZScore
diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
index 20b98fae..3a6461f8 100644
--- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
+++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
@@ -32,7 +32,7 @@ from paddlespeech.server.utils.errors import ErrorCode
from paddlespeech.server.utils.exception import ServerBaseException
from paddlespeech.server.utils.paddle_predictor import init_predictor
from paddlespeech.server.utils.paddle_predictor import run_model
-from paddlespeech.t2s.frontend import English
+from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
__all__ = ['TTSEngine', 'PaddleTTSConnectionHandler']
diff --git a/paddlespeech/server/ws/asr_api.py b/paddlespeech/server/ws/asr_api.py
index ae1c8831..b3ad0b7c 100644
--- a/paddlespeech/server/ws/asr_api.py
+++ b/paddlespeech/server/ws/asr_api.py
@@ -67,7 +67,7 @@ async def websocket_endpoint(websocket: WebSocket):
# and we break the loop
if message['signal'] == 'start':
resp = {"status": "ok", "signal": "server_ready"}
- # do something at begining here
+ # do something at beginning here
# create the instance to process the audio
#connection_handler = PaddleASRConnectionHanddler(asr_model)
connection_handler = asr_model.new_handler()
diff --git a/paddlespeech/t2s/__init__.py b/paddlespeech/t2s/__init__.py
index 57fe82a9..7d93c026 100644
--- a/paddlespeech/t2s/__init__.py
+++ b/paddlespeech/t2s/__init__.py
@@ -18,6 +18,5 @@ from . import exps
from . import frontend
from . import models
from . import modules
-from . import ssml
from . import training
from . import utils
diff --git a/paddlespeech/t2s/assets/__init__.py b/paddlespeech/t2s/assets/__init__.py
new file mode 100644
index 00000000..595add0a
--- /dev/null
+++ b/paddlespeech/t2s/assets/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/t2s/exps/csmsc_test.txt b/paddlespeech/t2s/assets/csmsc_test.txt
similarity index 100%
rename from paddlespeech/t2s/exps/csmsc_test.txt
rename to paddlespeech/t2s/assets/csmsc_test.txt
diff --git a/paddlespeech/t2s/exps/sentences.txt b/paddlespeech/t2s/assets/sentences.txt
similarity index 100%
rename from paddlespeech/t2s/exps/sentences.txt
rename to paddlespeech/t2s/assets/sentences.txt
diff --git a/paddlespeech/t2s/exps/sentences_canton.txt b/paddlespeech/t2s/assets/sentences_canton.txt
similarity index 100%
rename from paddlespeech/t2s/exps/sentences_canton.txt
rename to paddlespeech/t2s/assets/sentences_canton.txt
diff --git a/paddlespeech/t2s/exps/sentences_en.txt b/paddlespeech/t2s/assets/sentences_en.txt
similarity index 100%
rename from paddlespeech/t2s/exps/sentences_en.txt
rename to paddlespeech/t2s/assets/sentences_en.txt
diff --git a/paddlespeech/t2s/exps/sentences_mix.txt b/paddlespeech/t2s/assets/sentences_mix.txt
similarity index 90%
rename from paddlespeech/t2s/exps/sentences_mix.txt
rename to paddlespeech/t2s/assets/sentences_mix.txt
index 06e97d14..bfa0db63 100644
--- a/paddlespeech/t2s/exps/sentences_mix.txt
+++ b/paddlespeech/t2s/assets/sentences_mix.txt
@@ -5,4 +5,5 @@
005 Paddle Bo Bo: 使用 Paddle Speech 的语音合成模块生成虚拟人的声音。
006 热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!
007 我喜欢 eat apple, 你喜欢 drink milk。
-008 我们要去云南 team building, 非常非常 happy.
\ No newline at end of file
+008 我们要去云南 team building, 非常非常 happy.
+009 AI for Science 平台。
\ No newline at end of file
diff --git a/paddlespeech/t2s/assets/sentences_sing.txt b/paddlespeech/t2s/assets/sentences_sing.txt
new file mode 100644
index 00000000..7b9c6272
--- /dev/null
+++ b/paddlespeech/t2s/assets/sentences_sing.txt
@@ -0,0 +1,2 @@
+{"utt_id": "2093003457", "input_type": "word", "text": "小酒窝长睫毛AP是你最美的记号", "notes": "C#4/Db4 | F#4/Gb4 | G#4/Ab4 | A#4/Bb4 F#4/Gb4 | F#4/Gb4 C#4/Db4 | C#4/Db4 | rest | C#4/Db4 | A#4/Bb4 | G#4/Ab4 | A#4/Bb4 | G#4/Ab4 | F4 | C#4/Db4", "note_durs": "0.407140 | 0.376190 | 0.242180 | 0.509550 0.183420 | 0.315400 0.235020 | 0.361660 | 0.223070 | 0.377270 | 0.340550 | 0.299620 | 0.344510 | 0.283770 | 0.323390 | 0.360340"}
+{"utt_id": "2093003458", "input_type": "phoneme", "phones": "w o m ei t ian sh ui ui b u u zh ao AP x iang n ian n i d e w ei x iao iao AP" , "notes": "C#4/Db4 C#4/Db4 D#4/Eb4 D#4/Eb4 F4 F4 F#4/Gb4 F#4/Gb4 D#4/Eb4 D#4/Eb4 D#4/Eb4 A#3/Bb3 A#3/Bb3 A#3/Bb3 rest F#4/Gb4 F#4/Gb4 F4 F4 F#4/Gb4 F#4/Gb4 F4 F4 G#4/Ab4 G#4/Ab4 D#4/Eb4 D#4/Eb4 C#4/Db4 rest", "note_durs": "0.221750 0.221750 0.414460 0.414460 0.223160 0.223160 0.430900 0.430900 0.335990 0.269270 0.269270 0.289060 0.522690 0.522690 0.355060 0.397130 0.397130 0.247690 0.247690 0.406720 0.406720 0.246830 0.246830 0.307540 0.307540 0.429910 0.429910 0.519130 0.342300", "is_slurs": "0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0"}
\ No newline at end of file
diff --git a/paddlespeech/t2s/exps/sentences_ssml.txt b/paddlespeech/t2s/assets/sentences_ssml.txt
similarity index 100%
rename from paddlespeech/t2s/exps/sentences_ssml.txt
rename to paddlespeech/t2s/assets/sentences_ssml.txt
diff --git a/paddlespeech/t2s/datasets/am_batch_fn.py b/paddlespeech/t2s/datasets/am_batch_fn.py
index c95d908d..fe5d977a 100644
--- a/paddlespeech/t2s/datasets/am_batch_fn.py
+++ b/paddlespeech/t2s/datasets/am_batch_fn.py
@@ -114,7 +114,7 @@ def erniesat_batch_fn(examples,
]
span_bdy = paddle.to_tensor(span_bdy)
- # dual_mask 的是混合中英时候同时 mask 语音和文本
+    # dual_mask: when mixing Chinese and English, mask both speech and text at the same time
     # ernie sat masks both when doing cross-lingual synthesis
if text_masking:
masked_pos, text_masked_pos = phones_text_masking(
@@ -153,7 +153,7 @@ def erniesat_batch_fn(examples,
batch = {
"text": text,
"speech": speech,
- # need to generate
+ # need to generate
"masked_pos": masked_pos,
"speech_mask": speech_mask,
"text_mask": text_mask,
@@ -414,6 +414,135 @@ def fastspeech2_multi_spk_batch_fn(examples):
return batch
+def diffsinger_single_spk_batch_fn(examples):
+ # fields = ["text", "note", "note_dur", "is_slur", "text_lengths", \
+ # "speech", "speech_lengths", "durations", "pitch", "energy"]
+ text = [np.array(item["text"], dtype=np.int64) for item in examples]
+ note = [np.array(item["note"], dtype=np.int64) for item in examples]
+ note_dur = [
+ np.array(item["note_dur"], dtype=np.float32) for item in examples
+ ]
+ is_slur = [np.array(item["is_slur"], dtype=np.int64) for item in examples]
+ speech = [np.array(item["speech"], dtype=np.float32) for item in examples]
+ pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples]
+ energy = [np.array(item["energy"], dtype=np.float32) for item in examples]
+ durations = [
+ np.array(item["durations"], dtype=np.int64) for item in examples
+ ]
+
+ text_lengths = [
+ np.array(item["text_lengths"], dtype=np.int64) for item in examples
+ ]
+ speech_lengths = [
+ np.array(item["speech_lengths"], dtype=np.int64) for item in examples
+ ]
+
+ text = batch_sequences(text)
+ note = batch_sequences(note)
+ note_dur = batch_sequences(note_dur)
+ is_slur = batch_sequences(is_slur)
+ pitch = batch_sequences(pitch)
+ speech = batch_sequences(speech)
+ durations = batch_sequences(durations)
+ energy = batch_sequences(energy)
+
+ # convert each batch to paddle.Tensor
+ text = paddle.to_tensor(text)
+ note = paddle.to_tensor(note)
+ note_dur = paddle.to_tensor(note_dur)
+ is_slur = paddle.to_tensor(is_slur)
+ pitch = paddle.to_tensor(pitch)
+ speech = paddle.to_tensor(speech)
+ durations = paddle.to_tensor(durations)
+ energy = paddle.to_tensor(energy)
+ text_lengths = paddle.to_tensor(text_lengths)
+ speech_lengths = paddle.to_tensor(speech_lengths)
+
+ batch = {
+ "text": text,
+ "note": note,
+ "note_dur": note_dur,
+ "is_slur": is_slur,
+ "text_lengths": text_lengths,
+ "durations": durations,
+ "speech": speech,
+ "speech_lengths": speech_lengths,
+ "pitch": pitch,
+ "energy": energy
+ }
+ return batch
+
+
+def diffsinger_multi_spk_batch_fn(examples):
+ # fields = ["text", "note", "note_dur", "is_slur", "text_lengths", "speech", \
+ # "speech_lengths", "durations", "pitch", "energy", "spk_id"/"spk_emb"]
+ text = [np.array(item["text"], dtype=np.int64) for item in examples]
+ note = [np.array(item["note"], dtype=np.int64) for item in examples]
+ note_dur = [
+ np.array(item["note_dur"], dtype=np.float32) for item in examples
+ ]
+ is_slur = [np.array(item["is_slur"], dtype=np.int64) for item in examples]
+ speech = [np.array(item["speech"], dtype=np.float32) for item in examples]
+ pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples]
+ energy = [np.array(item["energy"], dtype=np.float32) for item in examples]
+ durations = [
+ np.array(item["durations"], dtype=np.int64) for item in examples
+ ]
+ text_lengths = [
+ np.array(item["text_lengths"], dtype=np.int64) for item in examples
+ ]
+ speech_lengths = [
+ np.array(item["speech_lengths"], dtype=np.int64) for item in examples
+ ]
+
+ text = batch_sequences(text)
+ note = batch_sequences(note)
+ note_dur = batch_sequences(note_dur)
+ is_slur = batch_sequences(is_slur)
+ pitch = batch_sequences(pitch)
+ speech = batch_sequences(speech)
+ durations = batch_sequences(durations)
+ energy = batch_sequences(energy)
+
+ # convert each batch to paddle.Tensor
+ text = paddle.to_tensor(text)
+ note = paddle.to_tensor(note)
+ note_dur = paddle.to_tensor(note_dur)
+ is_slur = paddle.to_tensor(is_slur)
+ pitch = paddle.to_tensor(pitch)
+ speech = paddle.to_tensor(speech)
+ durations = paddle.to_tensor(durations)
+ energy = paddle.to_tensor(energy)
+ text_lengths = paddle.to_tensor(text_lengths)
+ speech_lengths = paddle.to_tensor(speech_lengths)
+
+ batch = {
+ "text": text,
+ "note": note,
+ "note_dur": note_dur,
+ "is_slur": is_slur,
+ "text_lengths": text_lengths,
+ "durations": durations,
+ "speech": speech,
+ "speech_lengths": speech_lengths,
+ "pitch": pitch,
+ "energy": energy
+ }
+ # spk_emb has a higher priority than spk_id
+ if "spk_emb" in examples[0]:
+ spk_emb = [
+ np.array(item["spk_emb"], dtype=np.float32) for item in examples
+ ]
+ spk_emb = batch_sequences(spk_emb)
+ spk_emb = paddle.to_tensor(spk_emb)
+ batch["spk_emb"] = spk_emb
+ elif "spk_id" in examples[0]:
+ spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
+ spk_id = paddle.to_tensor(spk_id)
+ batch["spk_id"] = spk_id
+ return batch
+
+
def transformer_single_spk_batch_fn(examples):
# fields = ["text", "text_lengths", "speech", "speech_lengths"]
text = [np.array(item["text"], dtype=np.int64) for item in examples]
@@ -540,6 +669,211 @@ def vits_multi_spk_batch_fn(examples):
return batch
+def jets_single_spk_batch_fn(examples):
+ """
+ Returns:
+ Dict[str, Any]:
+ - text (Tensor): Text index tensor (B, T_text).
+ - text_lengths (Tensor): Text length tensor (B,).
+ - feats (Tensor): Feature tensor (B, T_feats, aux_channels).
+ - feats_lengths (Tensor): Feature length tensor (B,).
+ - durations (Tensor): Feature tensor (B, T_text,).
+ - durations_lengths (Tensor): Durations length tensor (B,).
+ - pitch (Tensor): Feature tensor (B, pitch_length,).
+ - energy (Tensor): Feature tensor (B, energy_length,).
+ - speech (Tensor): Speech waveform tensor (B, T_wav).
+
+ """
+ # fields = ["text", "text_lengths", "feats", "feats_lengths", "durations", "pitch", "energy", "speech"]
+ text = [np.array(item["text"], dtype=np.int64) for item in examples]
+ feats = [np.array(item["feats"], dtype=np.float32) for item in examples]
+ durations = [
+ np.array(item["durations"], dtype=np.int64) for item in examples
+ ]
+ pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples]
+ energy = [np.array(item["energy"], dtype=np.float32) for item in examples]
+ speech = [np.array(item["wave"], dtype=np.float32) for item in examples]
+
+ text_lengths = [
+ np.array(item["text_lengths"], dtype=np.int64) for item in examples
+ ]
+ feats_lengths = [
+ np.array(item["feats_lengths"], dtype=np.int64) for item in examples
+ ]
+
+ text = batch_sequences(text)
+ feats = batch_sequences(feats)
+ durations = batch_sequences(durations)
+ pitch = batch_sequences(pitch)
+ energy = batch_sequences(energy)
+ speech = batch_sequences(speech)
+
+ # convert each batch to paddle.Tensor
+ text = paddle.to_tensor(text)
+ feats = paddle.to_tensor(feats)
+ durations = paddle.to_tensor(durations)
+ pitch = paddle.to_tensor(pitch)
+ energy = paddle.to_tensor(energy)
+ text_lengths = paddle.to_tensor(text_lengths)
+ feats_lengths = paddle.to_tensor(feats_lengths)
+
+ batch = {
+ "text": text,
+ "text_lengths": text_lengths,
+ "feats": feats,
+ "feats_lengths": feats_lengths,
+ "durations": durations,
+ "durations_lengths": text_lengths,
+ "pitch": pitch,
+ "energy": energy,
+ "speech": speech,
+ }
+ return batch
+
+
+def jets_multi_spk_batch_fn(examples):
+ """
+ Returns:
+ Dict[str, Any]:
+ - text (Tensor): Text index tensor (B, T_text).
+ - text_lengths (Tensor): Text length tensor (B,).
+ - feats (Tensor): Feature tensor (B, T_feats, aux_channels).
+ - feats_lengths (Tensor): Feature length tensor (B,).
+ - durations (Tensor): Feature tensor (B, T_text,).
+ - durations_lengths (Tensor): Durations length tensor (B,).
+ - pitch (Tensor): Feature tensor (B, pitch_length,).
+ - energy (Tensor): Feature tensor (B, energy_length,).
+ - speech (Tensor): Speech waveform tensor (B, T_wav).
+ - spk_id (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
+ - spk_emb (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
+ """
+ # fields = ["text", "text_lengths", "feats", "feats_lengths", "durations", "pitch", "energy", "speech", "spk_id"/"spk_emb"]
+ text = [np.array(item["text"], dtype=np.int64) for item in examples]
+ feats = [np.array(item["feats"], dtype=np.float32) for item in examples]
+ durations = [
+ np.array(item["durations"], dtype=np.int64) for item in examples
+ ]
+ pitch = [np.array(item["pitch"], dtype=np.float32) for item in examples]
+ energy = [np.array(item["energy"], dtype=np.float32) for item in examples]
+ speech = [np.array(item["wave"], dtype=np.float32) for item in examples]
+ text_lengths = [
+ np.array(item["text_lengths"], dtype=np.int64) for item in examples
+ ]
+ feats_lengths = [
+ np.array(item["feats_lengths"], dtype=np.int64) for item in examples
+ ]
+
+ text = batch_sequences(text)
+ feats = batch_sequences(feats)
+ durations = batch_sequences(durations)
+ pitch = batch_sequences(pitch)
+ energy = batch_sequences(energy)
+ speech = batch_sequences(speech)
+
+ # convert each batch to paddle.Tensor
+ text = paddle.to_tensor(text)
+ feats = paddle.to_tensor(feats)
+ durations = paddle.to_tensor(durations)
+ pitch = paddle.to_tensor(pitch)
+ energy = paddle.to_tensor(energy)
+ text_lengths = paddle.to_tensor(text_lengths)
+ feats_lengths = paddle.to_tensor(feats_lengths)
+
+ batch = {
+ "text": text,
+ "text_lengths": text_lengths,
+ "feats": feats,
+ "feats_lengths": feats_lengths,
+ "durations": durations,
+ "durations_lengths": text_lengths,
+ "pitch": pitch,
+ "energy": energy,
+ "speech": speech,
+ }
+ # spk_emb has a higher priority than spk_id
+ if "spk_emb" in examples[0]:
+ spk_emb = [
+ np.array(item["spk_emb"], dtype=np.float32) for item in examples
+ ]
+ spk_emb = batch_sequences(spk_emb)
+ spk_emb = paddle.to_tensor(spk_emb)
+ batch["spk_emb"] = spk_emb
+ elif "spk_id" in examples[0]:
+ spk_id = [np.array(item["spk_id"], dtype=np.int64) for item in examples]
+ spk_id = paddle.to_tensor(spk_id)
+ batch["spk_id"] = spk_id
+ return batch
+
+
+# build via a functor because extra parameters need to be passed in
+def build_starganv2_vc_collate_fn(latent_dim: int=16, max_mel_length: int=192):
+
+ return StarGANv2VCCollateFn(
+ latent_dim=latent_dim, max_mel_length=max_mel_length)
+
+
+class StarGANv2VCCollateFn:
+ """Functor class of common_collate_fn()"""
+
+ def __init__(self, latent_dim: int=16, max_mel_length: int=192):
+ self.latent_dim = latent_dim
+ self.max_mel_length = max_mel_length
+
+ def random_clip(self, mel: np.array):
+ # [T, 80]
+ mel_length = mel.shape[0]
+ if mel_length > self.max_mel_length:
+ random_start = np.random.randint(0,
+ mel_length - self.max_mel_length)
+
+ mel = mel[random_start:random_start + self.max_mel_length, :]
+ return mel
+
+    def __call__(self, examples):
+        return self.starganv2_vc_batch_fn(examples)
+
+ def starganv2_vc_batch_fn(self, examples):
+ batch_size = len(examples)
+
+ label = [np.array(item["label"], dtype=np.int64) for item in examples]
+ ref_label = [
+ np.array(item["ref_label"], dtype=np.int64) for item in examples
+ ]
+
+        # mels need to be randomly clipped to at most max_mel_length frames
+ mel = [self.random_clip(item["mel"]) for item in examples]
+ ref_mel = [self.random_clip(item["ref_mel"]) for item in examples]
+ ref_mel_2 = [self.random_clip(item["ref_mel_2"]) for item in examples]
+ mel = batch_sequences(mel)
+ ref_mel = batch_sequences(ref_mel)
+ ref_mel_2 = batch_sequences(ref_mel_2)
+
+ # convert each batch to paddle.Tensor
+ # (B,)
+ label = paddle.to_tensor(label)
+ ref_label = paddle.to_tensor(ref_label)
+ # [B, T, 80] -> [B, 1, 80, T]
+ mel = paddle.to_tensor(mel).transpose([0, 2, 1]).unsqueeze(1)
+ ref_mel = paddle.to_tensor(ref_mel).transpose([0, 2, 1]).unsqueeze(1)
+ ref_mel_2 = paddle.to_tensor(ref_mel_2).transpose(
+ [0, 2, 1]).unsqueeze(1)
+
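+        # two independent random latent vectors per item, used to sample target style codes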
+ z_trg = paddle.randn([batch_size, self.latent_dim])
+ z_trg2 = paddle.randn([batch_size, self.latent_dim])
+
+ batch = {
+ "x_real": mel,
+ "y_org": label,
+ "x_ref": ref_mel,
+ "x_ref2": ref_mel_2,
+ "y_trg": ref_label,
+ "z_trg": z_trg,
+ "z_trg2": z_trg2
+ }
+
+ return batch
+
+
# for PaddleSlim
def fastspeech2_single_spk_batch_fn_static(examples):
text = [np.array(item["text"], dtype=np.int64) for item in examples]
diff --git a/paddlespeech/t2s/datasets/data_table.py b/paddlespeech/t2s/datasets/data_table.py
index c9815af2..4ac67546 100644
--- a/paddlespeech/t2s/datasets/data_table.py
+++ b/paddlespeech/t2s/datasets/data_table.py
@@ -11,12 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import random
from multiprocessing import Manager
from typing import Any
from typing import Callable
from typing import Dict
from typing import List
+import numpy as np
from paddle.io import Dataset
@@ -131,3 +133,54 @@ class DataTable(Dataset):
The length of the dataset
"""
return len(self.data)
+
+
+class StarGANv2VCDataTable(DataTable):
+ def __init__(self, data: List[Dict[str, Any]]):
+ super().__init__(data)
+ raw_data = data
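+        # group samples by speaker so a second reference can be drawn from the same speaker in __getitem__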
+ spk_id_set = list(set([item['spk_id'] for item in raw_data]))
+ data_list_per_class = {}
+ for spk_id in spk_id_set:
+ data_list_per_class[spk_id] = []
+ for item in raw_data:
+ for spk_id in spk_id_set:
+ if item['spk_id'] == spk_id:
+ data_list_per_class[spk_id].append(item)
+ self.data_list_per_class = data_list_per_class
+
+ def __getitem__(self, idx: int) -> Dict[str, Any]:
+ """Get an example given an index.
+ Args:
+ idx (int): Index of the example to get
+
+ Returns:
+ Dict[str, Any]: A converted example
+ """
+ if self.use_cache and self.caches[idx] is not None:
+ return self.caches[idx]
+
+ data = self._get_metadata(idx)
+
+        # clipping is done in batch_fn
+        # each metadata entry is a dict, e.g.:
+ """
+ {'utt_id': 'p225_111', 'spk_id': '1', 'speech': 'path of *.npy'}
+ """
+ ref_data = random.choice(self.data)
+ ref_label = ref_data['spk_id']
+ ref_data_2 = random.choice(self.data_list_per_class[ref_label])
+ # mel_tensor, label, ref_mel_tensor, ref2_mel_tensor, ref_label
+ new_example = {
+ 'utt_id': data['utt_id'],
+ 'mel': np.load(data['speech']),
+ 'label': int(data['spk_id']),
+ 'ref_mel': np.load(ref_data['speech']),
+ 'ref_mel_2': np.load(ref_data_2['speech']),
+ 'ref_label': int(ref_label)
+ }
+
+ if self.use_cache:
+ self.caches[idx] = new_example
+
+ return new_example
diff --git a/paddlespeech/t2s/datasets/get_feats.py b/paddlespeech/t2s/datasets/get_feats.py
index 5ec97b81..ea273e24 100644
--- a/paddlespeech/t2s/datasets/get_feats.py
+++ b/paddlespeech/t2s/datasets/get_feats.py
@@ -12,17 +12,17 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
+from typing import List
+from typing import Optional
+from typing import Union
+
import librosa
import numpy as np
import pyworld
from scipy.interpolate import interp1d
-
-from typing import Optional
-from typing import Union
from typing_extensions import Literal
-
class LogMelFBank():
def __init__(self,
sr: int=24000,
@@ -79,7 +79,7 @@ class LogMelFBank():
def _spectrogram(self, wav: np.ndarray):
D = self._stft(wav)
- return np.abs(D) ** self.power
+ return np.abs(D)**self.power
def _mel_spectrogram(self, wav: np.ndarray):
S = self._spectrogram(wav)
@@ -117,7 +117,6 @@ class Pitch():
if (f0 == 0).all():
print("All frames seems to be unvoiced, this utt will be removed.")
return f0
-
# padding start and end of f0 sequence
start_f0 = f0[f0 != 0][0]
end_f0 = f0[f0 != 0][-1]
@@ -179,6 +178,8 @@ class Pitch():
f0 = self._calculate_f0(wav, use_continuous_f0, use_log_f0)
if use_token_averaged_f0 and duration is not None:
f0 = self._average_by_duration(f0, duration)
+ else:
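+            # without duration averaging, return frame-level f0 as a (T, 1) column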
+ f0 = np.expand_dims(np.array(f0), 0).T
return f0
@@ -237,6 +238,8 @@ class Energy():
energy = self._calculate_energy(wav)
if use_token_averaged_energy and duration is not None:
energy = self._average_by_duration(energy, duration)
+ else:
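+            # without duration averaging, return frame-level energy as a (T, 1) column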
+ energy = np.expand_dims(np.array(energy), 0).T
return energy
diff --git a/paddlespeech/t2s/datasets/preprocess_utils.py b/paddlespeech/t2s/datasets/preprocess_utils.py
index 445b69bd..bf813b22 100644
--- a/paddlespeech/t2s/datasets/preprocess_utils.py
+++ b/paddlespeech/t2s/datasets/preprocess_utils.py
@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import re
+from typing import List
+
+import librosa
+import numpy as np
# speaker|utt_id|phn dur phn dur ...
@@ -41,6 +45,90 @@ def get_phn_dur(file_name):
return sentence, speaker_set
+def note2midi(notes: List[str]) -> List[int]:
+    """Convert note strings to note ids, for example: ["C1"] -> [24]
+
+ Args:
+ notes (List[str]): the list of note string
+
+ Returns:
+        List[int]: the list of note ids
+ """
+ midis = []
+ for note in notes:
+ if note == 'rest':
+ midi = 0
+ else:
+ midi = librosa.note_to_midi(note.split("/")[0])
+ midis.append(midi)
+
+ return midis
+
+
+def time2frame(
+ times: List[float],
+ sample_rate: int=24000,
+ n_shift: int=128, ) -> List[int]:
+ """Convert the phoneme duration of time(s) into frames
+
+ Args:
+ times (List[float]): phoneme duration of time(s)
+ sample_rate (int, optional): sample rate. Defaults to 24000.
+ n_shift (int, optional): frame shift. Defaults to 128.
+
+ Returns:
+ List[int]: phoneme duration of frame
+ """
+ end = 0.0
+ ends = []
+ for t in times:
+ end += t
+ ends.append(end)
+ frame_pos = librosa.time_to_frames(ends, sr=sample_rate, hop_length=n_shift)
+ durations = np.diff(frame_pos, prepend=0)
+ return durations
+
+
+def get_sentences_svs(
+ file_name,
+ dataset: str='opencpop',
+ sample_rate: int=24000,
+ n_shift: int=128, ):
+ '''
+ read label file
+ Args:
+ file_name (str or Path): path of gen_duration_from_textgrid.py's result
+ dataset (str): dataset name
+ Returns:
+        Dict: sentence information; each entry contains phone ids (int), phone durations in frames (int), note ids (int), note durations (float), is_slur flags (int), text (str) and the speaker name (str)
+        set: speaker names
+ '''
+ f = open(file_name, 'r')
+ sentence = {}
+ speaker_set = set()
+ if dataset == 'opencpop':
+ speaker_set.add("opencpop")
+ for line in f:
+ line_list = line.strip().split('|')
+ utt = line_list[0]
+ text = line_list[1]
+ ph = line_list[2].split()
+ midi = note2midi(line_list[3].split())
+ midi_dur = line_list[4].split()
+            ph_dur = time2frame(
+                [float(t) for t in line_list[5].split()],
+                sample_rate=sample_rate,
+                n_shift=n_shift)
+ is_slur = line_list[6].split()
+ assert len(ph) == len(midi) == len(midi_dur) == len(is_slur)
+ sentence[utt] = (ph, [int(i) for i in ph_dur],
+ [int(i) for i in midi],
+ [float(i) for i in midi_dur],
+ [int(i) for i in is_slur], text, "opencpop")
+ else:
+ print("dataset should in {opencpop} now!")
+
+ f.close()
+ return sentence, speaker_set
+
+
def merge_silence(sentence):
'''
merge silences
@@ -88,6 +176,9 @@ def get_input_token(sentence, output_path, dataset="baker"):
phn_token = ["", ""] + phn_token
if dataset in {"baker", "aishell3"}:
phn_token += [",", "。", "?", "!"]
+ # svs dataset
+ elif dataset in {"opencpop"}:
+ pass
else:
phn_token += [",", ".", "?", "!"]
phn_token += [""]
diff --git a/paddlespeech/t2s/exps/PTQ_static.py b/paddlespeech/t2s/exps/PTQ_static.py
index 16b3ae98..a9578645 100644
--- a/paddlespeech/t2s/exps/PTQ_static.py
+++ b/paddlespeech/t2s/exps/PTQ_static.py
@@ -42,6 +42,8 @@ def parse_args():
'hifigan_aishell3',
'hifigan_ljspeech',
'hifigan_vctk',
+ 'pwgan_opencpop',
+ 'hifigan_opencpop',
],
help='Choose model type of tts task.')
diff --git a/paddlespeech/t2s/exps/diffsinger/__init__.py b/paddlespeech/t2s/exps/diffsinger/__init__.py
new file mode 100644
index 00000000..595add0a
--- /dev/null
+++ b/paddlespeech/t2s/exps/diffsinger/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/t2s/exps/diffsinger/gen_gta_mel.py b/paddlespeech/t2s/exps/diffsinger/gen_gta_mel.py
new file mode 100644
index 00000000..519808f2
--- /dev/null
+++ b/paddlespeech/t2s/exps/diffsinger/gen_gta_mel.py
@@ -0,0 +1,240 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# generate mels using durations.txt
+# for mb melgan finetune
+import argparse
+import os
+from pathlib import Path
+
+import numpy as np
+import paddle
+import yaml
+from tqdm import tqdm
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.datasets.preprocess_utils import get_sentences_svs
+from paddlespeech.t2s.models.diffsinger import DiffSinger
+from paddlespeech.t2s.models.diffsinger import DiffSingerInference
+from paddlespeech.t2s.modules.normalizer import ZScore
+from paddlespeech.t2s.utils import str2bool
+
+
+def evaluate(args, diffsinger_config):
+ rootdir = Path(args.rootdir).expanduser()
+ assert rootdir.is_dir()
+
+ # construct dataset for evaluation
+ with open(args.phones_dict, "r") as f:
+ phn_id = [line.strip().split() for line in f.readlines()]
+ vocab_size = len(phn_id)
+ print("vocab_size:", vocab_size)
+
+ phone_dict = {}
+ for phn, id in phn_id:
+ phone_dict[phn] = int(id)
+
+ if args.speaker_dict:
+ with open(args.speaker_dict, 'rt') as f:
+ spk_id_list = [line.strip().split() for line in f.readlines()]
+ spk_num = len(spk_id_list)
+ else:
+ spk_num = None
+
+    stretch = np.load(args.diffsinger_stretch)
+    spec_min = stretch[0]
+    spec_max = stretch[1]
+ spec_min = paddle.to_tensor(spec_min)
+ spec_max = paddle.to_tensor(spec_max)
+ print("min and max spec done!")
+
+ odim = diffsinger_config.n_mels
+ diffsinger_config["model"]["fastspeech2_params"]["spk_num"] = spk_num
+ model = DiffSinger(
+ spec_min=spec_min,
+ spec_max=spec_max,
+ idim=vocab_size,
+ odim=odim,
+ **diffsinger_config["model"], )
+
+ model.set_state_dict(paddle.load(args.diffsinger_checkpoint)["main_params"])
+ model.eval()
+
+ stat = np.load(args.diffsinger_stat)
+ mu, std = stat
+ mu = paddle.to_tensor(mu)
+ std = paddle.to_tensor(std)
+ diffsinger_normalizer = ZScore(mu, std)
+
+ diffsinger_inference = DiffSingerInference(diffsinger_normalizer, model)
+ diffsinger_inference.eval()
+
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ sentences, speaker_set = get_sentences_svs(
+ args.dur_file,
+ dataset=args.dataset,
+ sample_rate=diffsinger_config.fs,
+ n_shift=diffsinger_config.n_shift, )
+
+ if args.dataset == "opencpop":
+ wavdir = rootdir / "wavs"
+ # split data into 3 sections
+ train_file = rootdir / "train.txt"
+ train_wav_files = []
+ with open(train_file, "r") as f_train:
+ for line in f_train.readlines():
+ utt = line.split("|")[0]
+ wav_name = utt + ".wav"
+ wav_path = wavdir / wav_name
+ train_wav_files.append(wav_path)
+
+ test_file = rootdir / "test.txt"
+ dev_wav_files = []
+ test_wav_files = []
+ num_dev = 106
+ count = 0
+ with open(test_file, "r") as f_test:
+ for line in f_test.readlines():
+ count += 1
+ utt = line.split("|")[0]
+ wav_name = utt + ".wav"
+ wav_path = wavdir / wav_name
+ if count > num_dev:
+ test_wav_files.append(wav_path)
+ else:
+ dev_wav_files.append(wav_path)
+ else:
+ print("dataset should in {opencpop} now!")
+
+ train_wav_files = [
+ os.path.basename(str(str_path)) for str_path in train_wav_files
+ ]
+ dev_wav_files = [
+ os.path.basename(str(str_path)) for str_path in dev_wav_files
+ ]
+ test_wav_files = [
+ os.path.basename(str(str_path)) for str_path in test_wav_files
+ ]
+
+ for i, utt_id in enumerate(tqdm(sentences)):
+ phones = sentences[utt_id][0]
+ durations = sentences[utt_id][1]
+ note = sentences[utt_id][2]
+ note_dur = sentences[utt_id][3]
+ is_slur = sentences[utt_id][4]
+ speaker = sentences[utt_id][-1]
+
+ phone_ids = [phone_dict[phn] for phn in phones]
+ phone_ids = paddle.to_tensor(np.array(phone_ids))
+
+ if args.speaker_dict:
+ speaker_id = int(
+ [item[1] for item in spk_id_list if speaker == item[0]][0])
+ speaker_id = paddle.to_tensor(speaker_id)
+ else:
+ speaker_id = None
+
+ durations = paddle.to_tensor(np.array(durations))
+ note = paddle.to_tensor(np.array(note))
+ note_dur = paddle.to_tensor(np.array(note_dur))
+ is_slur = paddle.to_tensor(np.array(is_slur))
+        # the generated mel may differ from the ground-truth one by 1-2 frames, but batch_fn will fix this
+        # decide which split (train/dev/test) this utterance belongs to
+
+ wav_path = utt_id + ".wav"
+
+ if wav_path in train_wav_files:
+ sub_output_dir = output_dir / ("train/raw")
+ elif wav_path in dev_wav_files:
+ sub_output_dir = output_dir / ("dev/raw")
+ elif wav_path in test_wav_files:
+ sub_output_dir = output_dir / ("test/raw")
+
+ sub_output_dir.mkdir(parents=True, exist_ok=True)
+
+ with paddle.no_grad():
+ mel = diffsinger_inference(
+ text=phone_ids,
+ note=note,
+ note_dur=note_dur,
+ is_slur=is_slur,
+ get_mel_fs2=False)
+ np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)
+
+
+def main():
+ # parse args and config and redirect to train_sp
+ parser = argparse.ArgumentParser(
+ description="Generate mel with diffsinger.")
+ parser.add_argument(
+ "--dataset",
+ default="opencpop",
+ type=str,
+ help="name of dataset, should in {opencpop} now")
+ parser.add_argument(
+ "--rootdir", default=None, type=str, help="directory to dataset.")
+ parser.add_argument(
+ "--diffsinger-config", type=str, help="diffsinger config file.")
+ parser.add_argument(
+ "--diffsinger-checkpoint",
+ type=str,
+ help="diffsinger checkpoint to load.")
+ parser.add_argument(
+ "--diffsinger-stat",
+ type=str,
+ help="mean and standard deviation used to normalize spectrogram when training diffsinger."
+ )
+ parser.add_argument(
+ "--diffsinger-stretch",
+ type=str,
+ help="min and max mel used to stretch before training diffusion.")
+
+ parser.add_argument(
+ "--phones-dict",
+ type=str,
+ default="phone_id_map.txt",
+ help="phone vocabulary file.")
+
+ parser.add_argument(
+ "--speaker-dict", type=str, default=None, help="speaker id map file.")
+
+ parser.add_argument(
+ "--dur-file", default=None, type=str, help="path to durations.txt.")
+ parser.add_argument("--output-dir", type=str, help="output dir.")
+ parser.add_argument(
+ "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+
+ args = parser.parse_args()
+
+ if args.ngpu == 0:
+ paddle.set_device("cpu")
+ elif args.ngpu > 0:
+ paddle.set_device("gpu")
+ else:
+ print("ngpu should >= 0 !")
+
+ with open(args.diffsinger_config) as f:
+ diffsinger_config = CfgNode(yaml.safe_load(f))
+
+ print("========Args========")
+ print(yaml.safe_dump(vars(args)))
+ print("========Config========")
+ print(diffsinger_config)
+
+ evaluate(args, diffsinger_config)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/diffsinger/get_minmax.py b/paddlespeech/t2s/exps/diffsinger/get_minmax.py
new file mode 100644
index 00000000..5457f1e2
--- /dev/null
+++ b/paddlespeech/t2s/exps/diffsinger/get_minmax.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import logging
+
+import jsonlines
+import numpy as np
+from tqdm import tqdm
+
+from paddlespeech.t2s.datasets.data_table import DataTable
+
+
+def get_minmax(spec, min_spec, max_spec):
+ # spec: [T, 80]
+ for i in range(spec.shape[1]):
+ min_value = np.min(spec[:, i])
+ max_value = np.max(spec[:, i])
+ min_spec[i] = min(min_value, min_spec[i])
+ max_spec[i] = max(max_value, max_spec[i])
+
+ return min_spec, max_spec
+
+
+def main():
+ """Run preprocessing process."""
+    parser = argparse.ArgumentParser(
+        description="Compute the per-bin min/max of mel spectrograms from dumped raw features."
+    )
+ parser.add_argument(
+ "--metadata",
+ type=str,
+ required=True,
+ help="directory including feature files to be normalized. "
+ "you need to specify either *-scp or rootdir.")
+
+ parser.add_argument(
+ "--speech-stretchs",
+ type=str,
+ required=True,
+ help="min max spec file. only computer on train data")
+
+ args = parser.parse_args()
+
+ # get dataset
+ with jsonlines.open(args.metadata, 'r') as reader:
+ metadata = list(reader)
+ dataset = DataTable(
+ metadata, converters={
+ "speech": np.load,
+ })
+ logging.info(f"The number of files = {len(dataset)}.")
+
+ n_mel = 80
+ min_spec = 100.0 * np.ones(shape=(n_mel), dtype=np.float32)
+ max_spec = -100.0 * np.ones(shape=(n_mel), dtype=np.float32)
+
+ for item in tqdm(dataset):
+ spec = item['speech']
+ min_spec, max_spec = get_minmax(spec, min_spec, max_spec)
+
+    # Clamping min_spec to -6.0 has given better training results so far
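+    # The saved array has shape (2, n_mel): row 0 is the (clamped) per-bin minimum,
+    # row 1 the per-bin maximum; it is later used to stretch mels to a fixed range
+    # before diffusion training.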
+ min_spec = -6.0 * np.ones(shape=(n_mel), dtype=np.float32)
+ min_max_spec = np.stack([min_spec, max_spec], axis=0)
+ np.save(
+ str(args.speech_stretchs),
+ min_max_spec.astype(np.float32),
+ allow_pickle=False)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/diffsinger/normalize.py b/paddlespeech/t2s/exps/diffsinger/normalize.py
new file mode 100644
index 00000000..d3e61162
--- /dev/null
+++ b/paddlespeech/t2s/exps/diffsinger/normalize.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Normalize feature files and dump them."""
+import argparse
+import logging
+from operator import itemgetter
+from pathlib import Path
+
+import jsonlines
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from tqdm import tqdm
+
+from paddlespeech.t2s.datasets.data_table import DataTable
+from paddlespeech.t2s.utils import str2bool
+
+
+def main():
+ """Run preprocessing process."""
+ parser = argparse.ArgumentParser(
+ description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)."
+ )
+ parser.add_argument(
+ "--metadata",
+ type=str,
+ required=True,
+ help="directory including feature files to be normalized. "
+ "you need to specify either *-scp or rootdir.")
+
+ parser.add_argument(
+ "--dumpdir",
+ type=str,
+ required=True,
+ help="directory to dump normalized feature files.")
+ parser.add_argument(
+ "--speech-stats",
+ type=str,
+ required=True,
+ help="speech statistics file.")
+ parser.add_argument(
+ "--pitch-stats", type=str, required=True, help="pitch statistics file.")
+ parser.add_argument(
+ "--energy-stats",
+ type=str,
+ required=True,
+ help="energy statistics file.")
+ parser.add_argument(
+ "--phones-dict", type=str, default=None, help="phone vocabulary file.")
+ parser.add_argument(
+ "--speaker-dict", type=str, default=None, help="speaker id map file.")
+ parser.add_argument(
+ "--norm-feats",
+ type=str2bool,
+ default=False,
+ help="whether to norm features")
+
+ args = parser.parse_args()
+
+ dumpdir = Path(args.dumpdir).expanduser()
+ # use absolute path
+ dumpdir = dumpdir.resolve()
+ dumpdir.mkdir(parents=True, exist_ok=True)
+
+ # get dataset
+ with jsonlines.open(args.metadata, 'r') as reader:
+ metadata = list(reader)
+ dataset = DataTable(
+ metadata,
+ converters={
+ "speech": np.load,
+ "pitch": np.load,
+ "energy": np.load,
+ })
+ logging.info(f"The number of files = {len(dataset)}.")
+
+ # restore scaler
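+    # When --norm-feats is False, the scalers fall back to zero mean and unit scale
+    # (an identity transform), so the raw features are dumped without normalization.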
+ speech_scaler = StandardScaler()
+ if args.norm_feats:
+ speech_scaler.mean_ = np.load(args.speech_stats)[0]
+ speech_scaler.scale_ = np.load(args.speech_stats)[1]
+ else:
+ speech_scaler.mean_ = np.zeros(
+ np.load(args.speech_stats)[0].shape, dtype="float32")
+ speech_scaler.scale_ = np.ones(
+ np.load(args.speech_stats)[1].shape, dtype="float32")
+ speech_scaler.n_features_in_ = speech_scaler.mean_.shape[0]
+
+ pitch_scaler = StandardScaler()
+ if args.norm_feats:
+ pitch_scaler.mean_ = np.load(args.pitch_stats)[0]
+ pitch_scaler.scale_ = np.load(args.pitch_stats)[1]
+ else:
+ pitch_scaler.mean_ = np.zeros(
+ np.load(args.pitch_stats)[0].shape, dtype="float32")
+ pitch_scaler.scale_ = np.ones(
+ np.load(args.pitch_stats)[1].shape, dtype="float32")
+ pitch_scaler.n_features_in_ = pitch_scaler.mean_.shape[0]
+
+ energy_scaler = StandardScaler()
+ if args.norm_feats:
+ energy_scaler.mean_ = np.load(args.energy_stats)[0]
+ energy_scaler.scale_ = np.load(args.energy_stats)[1]
+ else:
+ energy_scaler.mean_ = np.zeros(
+ np.load(args.energy_stats)[0].shape, dtype="float32")
+ energy_scaler.scale_ = np.ones(
+ np.load(args.energy_stats)[1].shape, dtype="float32")
+ energy_scaler.n_features_in_ = energy_scaler.mean_.shape[0]
+
+ vocab_phones = {}
+ with open(args.phones_dict, 'rt') as f:
+ phn_id = [line.strip().split() for line in f.readlines()]
+ for phn, id in phn_id:
+ vocab_phones[phn] = int(id)
+
+ vocab_speaker = {}
+ with open(args.speaker_dict, 'rt') as f:
+ spk_id = [line.strip().split() for line in f.readlines()]
+ for spk, id in spk_id:
+ vocab_speaker[spk] = int(id)
+
+ # process each file
+ output_metadata = []
+
+ for item in tqdm(dataset):
+ utt_id = item['utt_id']
+ speech = item['speech']
+ pitch = item['pitch']
+ energy = item['energy']
+ # normalize
+ speech = speech_scaler.transform(speech)
+ speech_dir = dumpdir / "data_speech"
+ speech_dir.mkdir(parents=True, exist_ok=True)
+ speech_path = speech_dir / f"{utt_id}_speech.npy"
+ np.save(speech_path, speech.astype(np.float32), allow_pickle=False)
+
+ pitch = pitch_scaler.transform(pitch)
+ pitch_dir = dumpdir / "data_pitch"
+ pitch_dir.mkdir(parents=True, exist_ok=True)
+ pitch_path = pitch_dir / f"{utt_id}_pitch.npy"
+ np.save(pitch_path, pitch.astype(np.float32), allow_pickle=False)
+
+ energy = energy_scaler.transform(energy)
+ energy_dir = dumpdir / "data_energy"
+ energy_dir.mkdir(parents=True, exist_ok=True)
+ energy_path = energy_dir / f"{utt_id}_energy.npy"
+ np.save(energy_path, energy.astype(np.float32), allow_pickle=False)
+ phone_ids = [vocab_phones[p] for p in item['phones']]
+ spk_id = vocab_speaker[item["speaker"]]
+ record = {
+ "utt_id": item['utt_id'],
+ "spk_id": spk_id,
+ "text": phone_ids,
+ "text_lengths": item['text_lengths'],
+ "speech_lengths": item['speech_lengths'],
+ "durations": item['durations'],
+ "speech": str(speech_path),
+ "pitch": str(pitch_path),
+ "energy": str(energy_path),
+ "note": item['note'],
+ "note_dur": item['note_dur'],
+ "is_slur": item['is_slur'],
+ }
+ # add spk_emb for voice cloning
+ if "spk_emb" in item:
+ record["spk_emb"] = str(item["spk_emb"])
+
+ output_metadata.append(record)
+ output_metadata.sort(key=itemgetter('utt_id'))
+ output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
+ with jsonlines.open(output_metadata_path, 'w') as writer:
+ for item in output_metadata:
+ writer.write(item)
+ logging.info(f"metadata dumped into {output_metadata_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/diffsinger/preprocess.py b/paddlespeech/t2s/exps/diffsinger/preprocess.py
new file mode 100644
index 00000000..a60ad44d
--- /dev/null
+++ b/paddlespeech/t2s/exps/diffsinger/preprocess.py
@@ -0,0 +1,377 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+from concurrent.futures import ThreadPoolExecutor
+from operator import itemgetter
+from pathlib import Path
+from typing import Any
+from typing import Dict
+from typing import List
+
+import jsonlines
+import librosa
+import numpy as np
+import tqdm
+import yaml
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.datasets.get_feats import Energy
+from paddlespeech.t2s.datasets.get_feats import LogMelFBank
+from paddlespeech.t2s.datasets.get_feats import Pitch
+from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length
+from paddlespeech.t2s.datasets.preprocess_utils import get_input_token
+from paddlespeech.t2s.datasets.preprocess_utils import get_sentences_svs
+from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
+from paddlespeech.t2s.utils import str2bool
+
+ALL_INITIALS = [
+ 'zh', 'ch', 'sh', 'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h',
+ 'j', 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w'
+]
+ALL_FINALS = [
+ 'a', 'ai', 'an', 'ang', 'ao', 'e', 'ei', 'en', 'eng', 'er', 'i', 'ia',
+ 'ian', 'iang', 'iao', 'ie', 'in', 'ing', 'iong', 'iu', 'ng', 'o', 'ong',
+ 'ou', 'u', 'ua', 'uai', 'uan', 'uang', 'ui', 'un', 'uo', 'v', 'van', 've',
+ 'vn'
+]
+
+
+def process_sentence(
+ config: Dict[str, Any],
+ fp: Path,
+ sentences: Dict,
+ output_dir: Path,
+ mel_extractor=None,
+ pitch_extractor=None,
+ energy_extractor=None,
+ cut_sil: bool=True,
+ spk_emb_dir: Path=None, ):
+ utt_id = fp.stem
+ record = None
+ if utt_id in sentences:
+ # reading, resampling may occur
+ wav, _ = librosa.load(str(fp), sr=config.fs)
+ if len(wav.shape) != 1:
+ return record
+ max_value = np.abs(wav).max()
+ if max_value > 1.0:
+ wav = wav / max_value
+        assert len(wav.shape) == 1, f"{utt_id} is not mono-channel audio."
+        assert np.abs(wav).max(
+        ) <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
+ phones = sentences[utt_id][0]
+ durations = sentences[utt_id][1]
+ note = sentences[utt_id][2]
+ note_dur = sentences[utt_id][3]
+ is_slur = sentences[utt_id][4]
+ speaker = sentences[utt_id][-1]
+
+ # extract mel feats
+ logmel = mel_extractor.get_log_mel_fbank(wav)
+ # change duration according to mel_length
+ compare_duration_and_mel_length(sentences, utt_id, logmel)
+ # utt_id may be popped in compare_duration_and_mel_length
+ if utt_id not in sentences:
+ return None
+ phones = sentences[utt_id][0]
+ durations = sentences[utt_id][1]
+ num_frames = logmel.shape[0]
+
+        assert sum(
+            durations
+        ) == num_frames, "the sum of durations doesn't equal the number of mel frames."
+ speech_dir = output_dir / "data_speech"
+ speech_dir.mkdir(parents=True, exist_ok=True)
+ speech_path = speech_dir / (utt_id + "_speech.npy")
+ np.save(speech_path, logmel)
+ # extract pitch and energy
+ pitch = pitch_extractor.get_pitch(wav)
+ assert pitch.shape[0] == num_frames
+ pitch_dir = output_dir / "data_pitch"
+ pitch_dir.mkdir(parents=True, exist_ok=True)
+ pitch_path = pitch_dir / (utt_id + "_pitch.npy")
+ np.save(pitch_path, pitch)
+ energy = energy_extractor.get_energy(wav)
+ assert energy.shape[0] == num_frames
+ energy_dir = output_dir / "data_energy"
+ energy_dir.mkdir(parents=True, exist_ok=True)
+ energy_path = energy_dir / (utt_id + "_energy.npy")
+ np.save(energy_path, energy)
+
+ record = {
+ "utt_id": utt_id,
+ "phones": phones,
+ "text_lengths": len(phones),
+ "speech_lengths": num_frames,
+ "durations": durations,
+ "speech": str(speech_path),
+ "pitch": str(pitch_path),
+ "energy": str(energy_path),
+ "speaker": speaker,
+ "note": note,
+ "note_dur": note_dur,
+ "is_slur": is_slur,
+ }
+ if spk_emb_dir:
+ if speaker in os.listdir(spk_emb_dir):
+ embed_name = utt_id + ".npy"
+ embed_path = spk_emb_dir / speaker / embed_name
+ if embed_path.is_file():
+ record["spk_emb"] = str(embed_path)
+ else:
+ return None
+ return record
+
+
+def process_sentences(
+ config,
+ fps: List[Path],
+ sentences: Dict,
+ output_dir: Path,
+ mel_extractor=None,
+ pitch_extractor=None,
+ energy_extractor=None,
+ nprocs: int=1,
+ cut_sil: bool=True,
+ spk_emb_dir: Path=None,
+ write_metadata_method: str='w', ):
+ if nprocs == 1:
+ results = []
+ for fp in tqdm.tqdm(fps, total=len(fps)):
+ record = process_sentence(
+ config=config,
+ fp=fp,
+ sentences=sentences,
+ output_dir=output_dir,
+ mel_extractor=mel_extractor,
+ pitch_extractor=pitch_extractor,
+ energy_extractor=energy_extractor,
+ cut_sil=cut_sil,
+ spk_emb_dir=spk_emb_dir, )
+ if record:
+ results.append(record)
+ else:
+ with ThreadPoolExecutor(nprocs) as pool:
+ futures = []
+ with tqdm.tqdm(total=len(fps)) as progress:
+ for fp in fps:
+ future = pool.submit(
+ process_sentence,
+ config,
+ fp,
+ sentences,
+ output_dir,
+ mel_extractor,
+ pitch_extractor,
+ energy_extractor,
+ cut_sil,
+ spk_emb_dir, )
+ future.add_done_callback(lambda p: progress.update())
+ futures.append(future)
+
+ results = []
+ for ft in futures:
+ record = ft.result()
+ if record:
+ results.append(record)
+
+ results.sort(key=itemgetter("utt_id"))
+ with jsonlines.open(output_dir / "metadata.jsonl",
+ write_metadata_method) as writer:
+ for item in results:
+ writer.write(item)
+ print("Done")
+
+
+def main():
+ # parse config and args
+ parser = argparse.ArgumentParser(
+ description="Preprocess audio and then extract features.")
+
+ parser.add_argument(
+ "--dataset",
+ default="opencpop",
+ type=str,
+ help="name of dataset, should in {opencpop} now")
+
+ parser.add_argument(
+ "--rootdir", default=None, type=str, help="directory to dataset.")
+
+ parser.add_argument(
+ "--dumpdir",
+ type=str,
+ required=True,
+ help="directory to dump feature files.")
+
+ parser.add_argument(
+ "--label-file", default=None, type=str, help="path to label file.")
+
+ parser.add_argument("--config", type=str, help="diffsinger config file.")
+
+ parser.add_argument(
+ "--num-cpu", type=int, default=1, help="number of process.")
+
+ parser.add_argument(
+ "--cut-sil",
+ type=str2bool,
+ default=True,
+ help="whether cut sil in the edge of audio")
+
+ parser.add_argument(
+ "--spk_emb_dir",
+ default=None,
+ type=str,
+ help="directory to speaker embedding files.")
+
+ parser.add_argument(
+ "--write_metadata_method",
+ default="w",
+ type=str,
+ choices=["w", "a"],
+ help="How the metadata.jsonl file is written.")
+ args = parser.parse_args()
+
+ rootdir = Path(args.rootdir).expanduser()
+ dumpdir = Path(args.dumpdir).expanduser()
+ # use absolute path
+ dumpdir = dumpdir.resolve()
+ dumpdir.mkdir(parents=True, exist_ok=True)
+ label_file = Path(args.label_file).expanduser()
+
+ if args.spk_emb_dir:
+ spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve()
+ else:
+ spk_emb_dir = None
+
+ assert rootdir.is_dir()
+ assert label_file.is_file()
+
+ with open(args.config, 'rt') as f:
+ config = CfgNode(yaml.safe_load(f))
+
+ sentences, speaker_set = get_sentences_svs(
+ label_file,
+ dataset=args.dataset,
+ sample_rate=config.fs,
+ n_shift=config.n_shift, )
+
+ phone_id_map_path = dumpdir / "phone_id_map.txt"
+ speaker_id_map_path = dumpdir / "speaker_id_map.txt"
+ get_input_token(sentences, phone_id_map_path, args.dataset)
+ get_spk_id_map(speaker_set, speaker_id_map_path)
+
+ if args.dataset == "opencpop":
+ wavdir = rootdir / "wavs"
+ # split data into 3 sections
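+        # Opencpop only ships train.txt / test.txt; the first 106 utterances of
+        # test.txt are held out as the dev set and the remaining ones as the test set.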
+ train_file = rootdir / "train.txt"
+ train_wav_files = []
+ with open(train_file, "r") as f_train:
+ for line in f_train.readlines():
+ utt = line.split("|")[0]
+ wav_name = utt + ".wav"
+ wav_path = wavdir / wav_name
+ train_wav_files.append(wav_path)
+
+ test_file = rootdir / "test.txt"
+ dev_wav_files = []
+ test_wav_files = []
+ num_dev = 106
+ count = 0
+ with open(test_file, "r") as f_test:
+ for line in f_test.readlines():
+ count += 1
+ utt = line.split("|")[0]
+ wav_name = utt + ".wav"
+ wav_path = wavdir / wav_name
+ if count > num_dev:
+ test_wav_files.append(wav_path)
+ else:
+ dev_wav_files.append(wav_path)
+
+ else:
+ print("dataset should in {opencpop} now!")
+
+ train_dump_dir = dumpdir / "train" / "raw"
+ train_dump_dir.mkdir(parents=True, exist_ok=True)
+ dev_dump_dir = dumpdir / "dev" / "raw"
+ dev_dump_dir.mkdir(parents=True, exist_ok=True)
+ test_dump_dir = dumpdir / "test" / "raw"
+ test_dump_dir.mkdir(parents=True, exist_ok=True)
+
+ # Extractor
+ mel_extractor = LogMelFBank(
+ sr=config.fs,
+ n_fft=config.n_fft,
+ hop_length=config.n_shift,
+ win_length=config.win_length,
+ window=config.window,
+ n_mels=config.n_mels,
+ fmin=config.fmin,
+ fmax=config.fmax)
+ pitch_extractor = Pitch(
+ sr=config.fs,
+ hop_length=config.n_shift,
+ f0min=config.f0min,
+ f0max=config.f0max)
+ energy_extractor = Energy(
+ n_fft=config.n_fft,
+ hop_length=config.n_shift,
+ win_length=config.win_length,
+ window=config.window)
+
+ # process for the 3 sections
+ if train_wav_files:
+ process_sentences(
+ config=config,
+ fps=train_wav_files,
+ sentences=sentences,
+ output_dir=train_dump_dir,
+ mel_extractor=mel_extractor,
+ pitch_extractor=pitch_extractor,
+ energy_extractor=energy_extractor,
+ nprocs=args.num_cpu,
+ cut_sil=args.cut_sil,
+ spk_emb_dir=spk_emb_dir,
+ write_metadata_method=args.write_metadata_method)
+ if dev_wav_files:
+ process_sentences(
+ config=config,
+ fps=dev_wav_files,
+ sentences=sentences,
+ output_dir=dev_dump_dir,
+ mel_extractor=mel_extractor,
+ pitch_extractor=pitch_extractor,
+ energy_extractor=energy_extractor,
+ nprocs=args.num_cpu,
+ cut_sil=args.cut_sil,
+ spk_emb_dir=spk_emb_dir,
+ write_metadata_method=args.write_metadata_method)
+ if test_wav_files:
+ process_sentences(
+ config=config,
+ fps=test_wav_files,
+ sentences=sentences,
+ output_dir=test_dump_dir,
+ mel_extractor=mel_extractor,
+ pitch_extractor=pitch_extractor,
+ energy_extractor=energy_extractor,
+ nprocs=args.num_cpu,
+ cut_sil=args.cut_sil,
+ spk_emb_dir=spk_emb_dir,
+ write_metadata_method=args.write_metadata_method)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/diffsinger/train.py b/paddlespeech/t2s/exps/diffsinger/train.py
new file mode 100644
index 00000000..e79104c4
--- /dev/null
+++ b/paddlespeech/t2s/exps/diffsinger/train.py
@@ -0,0 +1,257 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import logging
+import os
+import shutil
+from pathlib import Path
+
+import jsonlines
+import numpy as np
+import paddle
+import yaml
+from paddle import DataParallel
+from paddle import distributed as dist
+from paddle import nn
+from paddle.io import DataLoader
+from paddle.io import DistributedBatchSampler
+from paddle.optimizer import AdamW
+from paddle.optimizer.lr import StepDecay
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.datasets.am_batch_fn import diffsinger_multi_spk_batch_fn
+from paddlespeech.t2s.datasets.am_batch_fn import diffsinger_single_spk_batch_fn
+from paddlespeech.t2s.datasets.data_table import DataTable
+from paddlespeech.t2s.models.diffsinger import DiffSinger
+from paddlespeech.t2s.models.diffsinger import DiffSingerEvaluator
+from paddlespeech.t2s.models.diffsinger import DiffSingerUpdater
+from paddlespeech.t2s.models.diffsinger import DiffusionLoss
+from paddlespeech.t2s.models.diffsinger.fastspeech2midi import FastSpeech2MIDILoss
+from paddlespeech.t2s.training.extensions.snapshot import Snapshot
+from paddlespeech.t2s.training.extensions.visualizer import VisualDL
+from paddlespeech.t2s.training.optimizer import build_optimizers
+from paddlespeech.t2s.training.seeding import seed_everything
+from paddlespeech.t2s.training.trainer import Trainer
+
+
+def train_sp(args, config):
+ # decides device type and whether to run in parallel
+ # setup running environment correctly
+ if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
+ paddle.set_device("cpu")
+ else:
+ paddle.set_device("gpu")
+ world_size = paddle.distributed.get_world_size()
+ if world_size > 1:
+ paddle.distributed.init_parallel_env()
+
+ # set the random seed, it is a must for multiprocess training
+ seed_everything(config.seed)
+
+ print(
+ f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
+ )
+ fields = [
+ "text", "text_lengths", "speech", "speech_lengths", "durations",
+ "pitch", "energy", "note", "note_dur", "is_slur"
+ ]
+ converters = {"speech": np.load, "pitch": np.load, "energy": np.load}
+ spk_num = None
+ if args.speaker_dict is not None:
+ print("multiple speaker diffsinger!")
+ collate_fn = diffsinger_multi_spk_batch_fn
+ with open(args.speaker_dict, 'rt') as f:
+ spk_id = [line.strip().split() for line in f.readlines()]
+ spk_num = len(spk_id)
+ fields += ["spk_id"]
+ else:
+ collate_fn = diffsinger_single_spk_batch_fn
+ print("single speaker diffsinger!")
+
+ print("spk_num:", spk_num)
+
+ # dataloader has been too verbose
+ logging.getLogger("DataLoader").disabled = True
+
+ # construct dataset for training and validation
+ with jsonlines.open(args.train_metadata, 'r') as reader:
+ train_metadata = list(reader)
+ train_dataset = DataTable(
+ data=train_metadata,
+ fields=fields,
+ converters=converters, )
+ with jsonlines.open(args.dev_metadata, 'r') as reader:
+ dev_metadata = list(reader)
+ dev_dataset = DataTable(
+ data=dev_metadata,
+ fields=fields,
+ converters=converters, )
+
+ # collate function and dataloader
+ train_sampler = DistributedBatchSampler(
+ train_dataset,
+ batch_size=config.batch_size,
+ shuffle=True,
+ drop_last=True)
+
+ print("samplers done!")
+
+ train_dataloader = DataLoader(
+ train_dataset,
+ batch_sampler=train_sampler,
+ collate_fn=collate_fn,
+ num_workers=config.num_workers)
+
+ dev_dataloader = DataLoader(
+ dev_dataset,
+ shuffle=False,
+ drop_last=False,
+ batch_size=config.batch_size,
+ collate_fn=collate_fn,
+ num_workers=config.num_workers)
+ print("dataloaders done!")
+
+ with open(args.phones_dict, "r") as f:
+ phn_id = [line.strip().split() for line in f.readlines()]
+ vocab_size = len(phn_id)
+ print("vocab_size:", vocab_size)
+
+    spec_minmax = np.load(args.speech_stretchs)
+    spec_min = paddle.to_tensor(spec_minmax[0])
+    spec_max = paddle.to_tensor(spec_minmax[1])
+ print("min and max spec done!")
+
+ odim = config.n_mels
+ config["model"]["fastspeech2_params"]["spk_num"] = spk_num
+ model = DiffSinger(
+ spec_min=spec_min,
+ spec_max=spec_max,
+ idim=vocab_size,
+ odim=odim,
+ **config["model"], )
+ model_fs2 = model.fs2
+ model_ds = model.diffusion
+ if world_size > 1:
+ model = DataParallel(model)
+ model_fs2 = model._layers.fs2
+ model_ds = model._layers.diffusion
+ print("models done!")
+
+ criterion_fs2 = FastSpeech2MIDILoss(**config["fs2_updater"])
+ criterion_ds = DiffusionLoss(**config["ds_updater"])
+ print("criterions done!")
+
+ optimizer_fs2 = build_optimizers(model_fs2, **config["fs2_optimizer"])
+ lr_schedule_ds = StepDecay(**config["ds_scheduler_params"])
+ gradient_clip_ds = nn.ClipGradByGlobalNorm(config["ds_grad_norm"])
+ optimizer_ds = AdamW(
+ learning_rate=lr_schedule_ds,
+ grad_clip=gradient_clip_ds,
+ parameters=model_ds.parameters(),
+ **config["ds_optimizer_params"])
+ print("optimizer done!")
+
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+ if dist.get_rank() == 0:
+ config_name = args.config.split("/")[-1]
+ # copy conf to output_dir
+ shutil.copyfile(args.config, output_dir / config_name)
+
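+    # DiffSingerUpdater trains in two stages: the FastSpeech2MIDI branch is
+    # optimized first, and the diffusion branch starts after `ds_train_start_steps`
+    # iterations; `only_train_diffusion` controls whether the FastSpeech2 branch
+    # keeps updating after that point.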
+ updater = DiffSingerUpdater(
+ model=model,
+ optimizers={
+ "fs2": optimizer_fs2,
+ "ds": optimizer_ds,
+ },
+ criterions={
+ "fs2": criterion_fs2,
+ "ds": criterion_ds,
+ },
+ dataloader=train_dataloader,
+ ds_train_start_steps=config.ds_train_start_steps,
+ output_dir=output_dir,
+ only_train_diffusion=config["only_train_diffusion"])
+
+ evaluator = DiffSingerEvaluator(
+ model=model,
+ criterions={
+ "fs2": criterion_fs2,
+ "ds": criterion_ds,
+ },
+ dataloader=dev_dataloader,
+ output_dir=output_dir, )
+
+ trainer = Trainer(
+ updater,
+ stop_trigger=(config.train_max_steps, "iteration"),
+ out=output_dir, )
+
+ if dist.get_rank() == 0:
+ trainer.extend(
+ evaluator, trigger=(config.eval_interval_steps, 'iteration'))
+ trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
+ trainer.extend(
+ Snapshot(max_size=config.num_snapshots),
+ trigger=(config.save_interval_steps, 'iteration'))
+
+ print("Trainer Done!")
+ trainer.run()
+
+
+def main():
+ # parse args and config and redirect to train_sp
+ parser = argparse.ArgumentParser(description="Train a DiffSinger model.")
+ parser.add_argument("--config", type=str, help="diffsinger config file.")
+ parser.add_argument("--train-metadata", type=str, help="training data.")
+ parser.add_argument("--dev-metadata", type=str, help="dev data.")
+ parser.add_argument("--output-dir", type=str, help="output dir.")
+ parser.add_argument(
+ "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.")
+ parser.add_argument(
+ "--phones-dict", type=str, default=None, help="phone vocabulary file.")
+ parser.add_argument(
+ "--speaker-dict",
+ type=str,
+ default=None,
+ help="speaker id map file for multiple speaker model.")
+ parser.add_argument(
+ "--speech-stretchs",
+ type=str,
+ help="The min and max values of the mel spectrum.")
+
+ args = parser.parse_args()
+
+ with open(args.config) as f:
+ config = CfgNode(yaml.safe_load(f))
+
+ print("========Args========")
+ print(yaml.safe_dump(vars(args)))
+ print("========Config========")
+ print(config)
+ print(
+ f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
+ )
+
+ # dispatch
+ if args.ngpu > 1:
+ dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
+ else:
+ train_sp(args, config)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/dygraph_to_static.py b/paddlespeech/t2s/exps/dygraph_to_static.py
new file mode 100644
index 00000000..5e15ca4c
--- /dev/null
+++ b/paddlespeech/t2s/exps/dygraph_to_static.py
@@ -0,0 +1,170 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+import yaml
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.exps.syn_utils import am_to_static
+from paddlespeech.t2s.exps.syn_utils import get_am_inference
+from paddlespeech.t2s.exps.syn_utils import get_voc_inference
+from paddlespeech.t2s.exps.syn_utils import voc_to_static
+
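+# Example usage (checkpoint / config paths below are illustrative):
+#   python3 dygraph_to_static.py --type=am --am=fastspeech2_csmsc \
+#       --am_config=default.yaml --am_ckpt=snapshot_iter_153.pdz \
+#       --am_stat=speech_stats.npy --phones_dict=phone_id_map.txt \
+#       --inference_dir=exp/inference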
+
+def am_dygraph_to_static(args):
+ with open(args.am_config) as f:
+ am_config = CfgNode(yaml.safe_load(f))
+ am_inference = get_am_inference(
+ am=args.am,
+ am_config=am_config,
+ am_ckpt=args.am_ckpt,
+ am_stat=args.am_stat,
+ phones_dict=args.phones_dict,
+ tones_dict=args.tones_dict,
+ speaker_dict=args.speaker_dict)
+ print("acoustic model done!")
+
+ # dygraph to static
+ am_inference = am_to_static(
+ am_inference=am_inference,
+ am=args.am,
+ inference_dir=args.inference_dir,
+ speaker_dict=args.speaker_dict)
+ print("finish to convert dygraph acoustic model to static!")
+
+
+def voc_dygraph_to_static(args):
+ with open(args.voc_config) as f:
+ voc_config = CfgNode(yaml.safe_load(f))
+ voc_inference = get_voc_inference(
+ voc=args.voc,
+ voc_config=voc_config,
+ voc_ckpt=args.voc_ckpt,
+ voc_stat=args.voc_stat)
+ print("voc done!")
+
+ # dygraph to static
+ voc_inference = voc_to_static(
+ voc_inference=voc_inference,
+ voc=args.voc,
+ inference_dir=args.inference_dir)
+ print("finish to convert dygraph vocoder model to static!")
+
+
+def parse_args():
+ # parse args and config
+    parser = argparse.ArgumentParser(
+        description="Convert a dygraph acoustic model or vocoder to a static graph")
+ parser.add_argument(
+ '--type',
+ type=str,
+ required=True,
+ choices=["am", "voc"],
+        help='Choose which model to convert to a static graph: am or voc')
+ # acoustic model
+ parser.add_argument(
+ '--am',
+ type=str,
+ default='fastspeech2_csmsc',
+ choices=[
+ 'speedyspeech_csmsc',
+ 'speedyspeech_aishell3',
+ 'fastspeech2_csmsc',
+ 'fastspeech2_ljspeech',
+ 'fastspeech2_aishell3',
+ 'fastspeech2_vctk',
+ 'tacotron2_csmsc',
+ 'tacotron2_ljspeech',
+ 'fastspeech2_mix',
+ 'fastspeech2_canton',
+ 'fastspeech2_male-zh',
+ 'fastspeech2_male-en',
+ 'fastspeech2_male-mix',
+ ],
+ help='Choose acoustic model type of tts task.')
+ parser.add_argument(
+ '--am_config', type=str, default=None, help='Config of acoustic model.')
+ parser.add_argument(
+ '--am_ckpt',
+ type=str,
+ default=None,
+ help='Checkpoint file of acoustic model.')
+ parser.add_argument(
+ "--am_stat",
+ type=str,
+ default=None,
+ help="mean and standard deviation used to normalize spectrogram when training acoustic model."
+ )
+ parser.add_argument(
+ "--phones_dict", type=str, default=None, help="phone vocabulary file.")
+ parser.add_argument(
+ "--tones_dict", type=str, default=None, help="tone vocabulary file.")
+ parser.add_argument(
+ "--speaker_dict", type=str, default=None, help="speaker id map file.")
+ # vocoder
+ parser.add_argument(
+ '--voc',
+ type=str,
+ default='pwgan_csmsc',
+ choices=[
+ 'pwgan_csmsc',
+ 'pwgan_ljspeech',
+ 'pwgan_aishell3',
+ 'pwgan_vctk',
+ 'mb_melgan_csmsc',
+ 'style_melgan_csmsc',
+ 'hifigan_csmsc',
+ 'hifigan_ljspeech',
+ 'hifigan_aishell3',
+ 'hifigan_vctk',
+ 'wavernn_csmsc',
+ 'pwgan_male',
+ 'hifigan_male',
+ 'pwgan_opencpop',
+ 'hifigan_opencpop',
+ ],
+ help='Choose vocoder type of tts task.')
+ parser.add_argument(
+ '--voc_config', type=str, default=None, help='Config of voc.')
+ parser.add_argument(
+ '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.')
+ parser.add_argument(
+ "--voc_stat",
+ type=str,
+ default=None,
+ help="mean and standard deviation used to normalize spectrogram when training voc."
+ )
+ # other
+ parser.add_argument(
+ "--inference_dir",
+ type=str,
+ default=None,
+ help="dir to save inference models")
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ args = parse_args()
+
+ if args.type == "am":
+ am_dygraph_to_static(args)
+ elif args.type == "voc":
+ voc_dygraph_to_static(args)
+ else:
+ print("type should be in ['am', 'voc'] !")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/ernie_sat/preprocess.py b/paddlespeech/t2s/exps/ernie_sat/preprocess.py
index 486ed13a..04bbc074 100644
--- a/paddlespeech/t2s/exps/ernie_sat/preprocess.py
+++ b/paddlespeech/t2s/exps/ernie_sat/preprocess.py
@@ -324,6 +324,7 @@ def main():
sentences=sentences,
output_dir=dev_dump_dir,
mel_extractor=mel_extractor,
+ nprocs=args.num_cpu,
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if test_wav_files:
diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
index 521b9a88..a2353242 100644
--- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py
+++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
@@ -382,6 +382,7 @@ def main():
mel_extractor=mel_extractor,
pitch_extractor=pitch_extractor,
energy_extractor=energy_extractor,
+ nprocs=args.num_cpu,
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir,
write_metadata_method=args.write_metadata_method)
diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py
index 97626db0..24f2be7d 100644
--- a/paddlespeech/t2s/exps/fastspeech2/train.py
+++ b/paddlespeech/t2s/exps/fastspeech2/train.py
@@ -44,10 +44,17 @@ from paddlespeech.t2s.utils import str2bool
def train_sp(args, config):
# decides device type and whether to run in parallel
# setup running environment correctly
- if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
+ if args.ngpu > 0 and paddle.is_compiled_with_cuda():
+ paddle.set_device("gpu")
+ elif args.nxpu > 0 and paddle.is_compiled_with_xpu():
+ paddle.set_device("xpu")
+ elif args.ngpu == 0 and args.nxpu == 0:
paddle.set_device("cpu")
else:
- paddle.set_device("gpu")
+ raise ValueError(
+ "Please make sure that the paddle you installed matches the device type you set, "
+ "and that ngpu and nxpu cannot be negative at the same time.")
+
world_size = paddle.distributed.get_world_size()
if world_size > 1:
paddle.distributed.init_parallel_env()
@@ -183,7 +190,12 @@ def main():
parser.add_argument("--dev-metadata", type=str, help="dev data.")
parser.add_argument("--output-dir", type=str, help="output dir.")
parser.add_argument(
- "--ngpu", type=int, default=1, help="if ngpu=0, use cpu.")
+ "--ngpu", type=int, default=1, help="if ngpu=0, use cpu or xpu.")
+ parser.add_argument(
+ "--nxpu",
+ type=int,
+ default=0,
+ help="if ngpu=0 and nxpu > 0, use xpu. if ngpu=0 and nxpu=0, use cpu.")
parser.add_argument(
"--phones-dict", type=str, default=None, help="phone vocabulary file.")
parser.add_argument(
diff --git a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
index 05c65768..a2629a90 100644
--- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
@@ -29,6 +29,7 @@ from yacs.config import CfgNode
from paddlespeech.t2s.datasets.get_feats import LogMelFBank
from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
+from paddlespeech.t2s.datasets.preprocess_utils import get_sentences_svs
from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
from paddlespeech.t2s.utils import str2bool
@@ -192,8 +193,15 @@ def main():
with open(args.config, 'rt') as f:
config = CfgNode(yaml.safe_load(f))
- sentences, speaker_set = get_phn_dur(dur_file)
- merge_silence(sentences)
+ if args.dataset == "opencpop":
+ sentences, speaker_set = get_sentences_svs(
+ dur_file,
+ dataset=args.dataset,
+ sample_rate=config.fs,
+ n_shift=config.n_shift, )
+ else:
+ sentences, speaker_set = get_phn_dur(dur_file)
+ merge_silence(sentences)
# split data into 3 sections
if args.dataset == "baker":
@@ -240,6 +248,33 @@ def main():
test_wav_files += wav_files[-sub_num_dev:]
else:
train_wav_files += wav_files
+ elif args.dataset == "opencpop":
+ wavdir = rootdir / "wavs"
+ # split data into 3 sections
+ train_file = rootdir / "train.txt"
+ train_wav_files = []
+ with open(train_file, "r") as f_train:
+ for line in f_train.readlines():
+ utt = line.split("|")[0]
+ wav_name = utt + ".wav"
+ wav_path = wavdir / wav_name
+ train_wav_files.append(wav_path)
+
+ test_file = rootdir / "test.txt"
+ dev_wav_files = []
+ test_wav_files = []
+ num_dev = 106
+ count = 0
+ with open(test_file, "r") as f_test:
+ for line in f_test.readlines():
+ count += 1
+ utt = line.split("|")[0]
+ wav_name = utt + ".wav"
+ wav_path = wavdir / wav_name
+ if count > num_dev:
+ test_wav_files.append(wav_path)
+ else:
+ dev_wav_files.append(wav_path)
else:
print("dataset should in {baker, ljspeech, vctk, aishell3} now!")
diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py
index 31fe1449..8a526982 100644
--- a/paddlespeech/t2s/exps/inference.py
+++ b/paddlespeech/t2s/exps/inference.py
@@ -112,7 +112,7 @@ def parse_args():
parser.add_argument(
"--device",
default="gpu",
- choices=["gpu", "cpu"],
+ choices=["gpu", "cpu", "xpu"],
help="Device selected for inference.", )
parser.add_argument('--cpu_threads', type=int, default=1)
diff --git a/paddlespeech/t2s/exps/jets/__init__.py b/paddlespeech/t2s/exps/jets/__init__.py
new file mode 100644
index 00000000..97043fd7
--- /dev/null
+++ b/paddlespeech/t2s/exps/jets/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/t2s/exps/jets/inference.py b/paddlespeech/t2s/exps/jets/inference.py
new file mode 100644
index 00000000..4f6882ed
--- /dev/null
+++ b/paddlespeech/t2s/exps/jets/inference.py
@@ -0,0 +1,172 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from pathlib import Path
+
+import paddle
+import soundfile as sf
+from timer import timer
+
+from paddlespeech.t2s.exps.syn_utils import get_am_output
+from paddlespeech.t2s.exps.syn_utils import get_frontend
+from paddlespeech.t2s.exps.syn_utils import get_predictor
+from paddlespeech.t2s.exps.syn_utils import get_sentences
+from paddlespeech.t2s.utils import str2bool
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description="Paddle Infernce with acoustic model & vocoder.")
+ # acoustic model
+ parser.add_argument(
+ '--am',
+ type=str,
+ default='jets_csmsc',
+ choices=['jets_csmsc', 'jets_aishell3'],
+ help='Choose acoustic model type of tts task.')
+ parser.add_argument(
+ "--phones_dict", type=str, default=None, help="phone vocabulary file.")
+ parser.add_argument(
+ "--speaker_dict", type=str, default=None, help="speaker id map file.")
+ parser.add_argument(
+ '--spk_id',
+ type=int,
+ default=0,
+ help='spk id for multi speaker acoustic model')
+ # other
+ parser.add_argument(
+ '--lang',
+ type=str,
+ default='zh',
+ help='Choose model language. zh or en or mix')
+ parser.add_argument(
+ "--text",
+ type=str,
+ help="text to synthesize, a 'utt_id sentence' pair per line")
+ parser.add_argument(
+ "--add-blank",
+ type=str2bool,
+ default=True,
+ help="whether to add blank between phones")
+ parser.add_argument(
+ "--inference_dir", type=str, help="dir to save inference models")
+ parser.add_argument("--output_dir", type=str, help="output dir")
+ # inference
+ parser.add_argument(
+ "--use_trt",
+ type=str2bool,
+ default=False,
+ help="whether to use TensorRT or not in GPU", )
+ parser.add_argument(
+ "--use_mkldnn",
+ type=str2bool,
+ default=False,
+ help="whether to use MKLDNN or not in CPU.", )
+ parser.add_argument(
+ "--precision",
+ type=str,
+ default='fp32',
+ choices=['fp32', 'fp16', 'bf16', 'int8'],
+ help="mode of running")
+ parser.add_argument(
+ "--device",
+ default="gpu",
+ choices=["gpu", "cpu"],
+ help="Device selected for inference.", )
+ parser.add_argument('--cpu_threads', type=int, default=1)
+
+ args, _ = parser.parse_known_args()
+ return args
+
+
+# only inference for models trained with csmsc now
+def main():
+ args = parse_args()
+
+ paddle.set_device(args.device)
+
+ # frontend
+ frontend = get_frontend(lang=args.lang, phones_dict=args.phones_dict)
+
+ # am_predictor
+ am_predictor = get_predictor(
+ model_dir=args.inference_dir,
+ model_file=args.am + ".pdmodel",
+ params_file=args.am + ".pdiparams",
+ device=args.device,
+ use_trt=args.use_trt,
+ use_mkldnn=args.use_mkldnn,
+ cpu_threads=args.cpu_threads,
+ precision=args.precision)
+ # model: {model_name}_{dataset}
+ am_dataset = args.am[args.am.rindex('_') + 1:]
+
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ sentences = get_sentences(text_file=args.text, lang=args.lang)
+
+ merge_sentences = True
+ add_blank = args.add_blank
+ # jets's fs is 22050
+ fs = 22050
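+    # RTF below is synthesis time divided by generated-audio duration,
+    # i.e. fs / (samples generated per second of wall-clock time).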
+ # warmup
+ for utt_id, sentence in sentences[:3]:
+ with timer() as t:
+ wav = get_am_output(
+ input=sentence,
+ am_predictor=am_predictor,
+ am=args.am,
+ frontend=frontend,
+ lang=args.lang,
+ merge_sentences=merge_sentences,
+ speaker_dict=args.speaker_dict,
+ spk_id=args.spk_id, )
+ speed = wav.size / t.elapse
+ rtf = fs / speed
+ print(
+ f"{utt_id}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+ )
+
+ print("warm up done!")
+
+ N = 0
+ T = 0
+ for utt_id, sentence in sentences:
+ with timer() as t:
+ wav = get_am_output(
+ input=sentence,
+ am_predictor=am_predictor,
+ am=args.am,
+ frontend=frontend,
+ lang=args.lang,
+ merge_sentences=merge_sentences,
+ speaker_dict=args.speaker_dict,
+ spk_id=args.spk_id, )
+
+ N += wav.size
+ T += t.elapse
+ speed = wav.size / t.elapse
+ rtf = fs / speed
+ sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=fs)
+ print(
+ f"{utt_id}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+ )
+
+ print(f"{utt_id} done!")
+ print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/jets/normalize.py b/paddlespeech/t2s/exps/jets/normalize.py
new file mode 100644
index 00000000..8531f0db
--- /dev/null
+++ b/paddlespeech/t2s/exps/jets/normalize.py
@@ -0,0 +1,163 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Normalize feature files and dump them."""
+import argparse
+import logging
+from operator import itemgetter
+from pathlib import Path
+
+import jsonlines
+import numpy as np
+from sklearn.preprocessing import StandardScaler
+from tqdm import tqdm
+
+from paddlespeech.t2s.datasets.data_table import DataTable
+
+
+def main():
+ """Run preprocessing process."""
+ parser = argparse.ArgumentParser(
+ description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)."
+ )
+ parser.add_argument(
+ "--metadata",
+ type=str,
+ required=True,
+ help="directory including feature files to be normalized. "
+ "you need to specify either *-scp or rootdir.")
+
+ parser.add_argument(
+ "--dumpdir",
+ type=str,
+ required=True,
+ help="directory to dump normalized feature files.")
+ parser.add_argument(
+ "--feats-stats", type=str, required=True, help="feats statistics file.")
+ parser.add_argument(
+ "--pitch-stats", type=str, required=True, help="pitch statistics file.")
+ parser.add_argument(
+ "--energy-stats",
+ type=str,
+ required=True,
+ help="energy statistics file.")
+ parser.add_argument(
+ "--phones-dict", type=str, default=None, help="phone vocabulary file.")
+ parser.add_argument(
+ "--speaker-dict", type=str, default=None, help="speaker id map file.")
+
+ args = parser.parse_args()
+
+ dumpdir = Path(args.dumpdir).expanduser()
+ # use absolute path
+ dumpdir = dumpdir.resolve()
+ dumpdir.mkdir(parents=True, exist_ok=True)
+
+ # get dataset
+ with jsonlines.open(args.metadata, 'r') as reader:
+ metadata = list(reader)
+ dataset = DataTable(
+ metadata,
+ converters={
+ "feats": np.load,
+ "pitch": np.load,
+ "energy": np.load,
+ "wave": str,
+ })
+ logging.info(f"The number of files = {len(dataset)}.")
+
+ # restore scaler
+ feats_scaler = StandardScaler()
+ feats_scaler.mean_ = np.load(args.feats_stats)[0]
+ feats_scaler.scale_ = np.load(args.feats_stats)[1]
+ feats_scaler.n_features_in_ = feats_scaler.mean_.shape[0]
+
+ pitch_scaler = StandardScaler()
+ pitch_scaler.mean_ = np.load(args.pitch_stats)[0]
+ pitch_scaler.scale_ = np.load(args.pitch_stats)[1]
+ pitch_scaler.n_features_in_ = pitch_scaler.mean_.shape[0]
+
+ energy_scaler = StandardScaler()
+ energy_scaler.mean_ = np.load(args.energy_stats)[0]
+ energy_scaler.scale_ = np.load(args.energy_stats)[1]
+ energy_scaler.n_features_in_ = energy_scaler.mean_.shape[0]
+
+ vocab_phones = {}
+ with open(args.phones_dict, 'rt') as f:
+ phn_id = [line.strip().split() for line in f.readlines()]
+ for phn, id in phn_id:
+ vocab_phones[phn] = int(id)
+
+ vocab_speaker = {}
+ with open(args.speaker_dict, 'rt') as f:
+ spk_id = [line.strip().split() for line in f.readlines()]
+ for spk, id in spk_id:
+ vocab_speaker[spk] = int(id)
+
+ # process each file
+ output_metadata = []
+
+ for item in tqdm(dataset):
+ utt_id = item['utt_id']
+ feats = item['feats']
+ pitch = item['pitch']
+ energy = item['energy']
+ wave_path = item['wave']
+ # normalize
+ feats = feats_scaler.transform(feats)
+ feats_dir = dumpdir / "data_feats"
+ feats_dir.mkdir(parents=True, exist_ok=True)
+ feats_path = feats_dir / f"{utt_id}_feats.npy"
+ np.save(feats_path, feats.astype(np.float32), allow_pickle=False)
+
+ pitch = pitch_scaler.transform(pitch)
+ pitch_dir = dumpdir / "data_pitch"
+ pitch_dir.mkdir(parents=True, exist_ok=True)
+ pitch_path = pitch_dir / f"{utt_id}_pitch.npy"
+ np.save(pitch_path, pitch.astype(np.float32), allow_pickle=False)
+
+ energy = energy_scaler.transform(energy)
+ energy_dir = dumpdir / "data_energy"
+ energy_dir.mkdir(parents=True, exist_ok=True)
+ energy_path = energy_dir / f"{utt_id}_energy.npy"
+ np.save(energy_path, energy.astype(np.float32), allow_pickle=False)
+
+ phone_ids = [vocab_phones[p] for p in item['phones']]
+ spk_id = vocab_speaker[item["speaker"]]
+ record = {
+ "utt_id": item['utt_id'],
+ "spk_id": spk_id,
+ "text": phone_ids,
+ "text_lengths": item['text_lengths'],
+ "feats_lengths": item['feats_lengths'],
+ "durations": item['durations'],
+ "feats": str(feats_path),
+ "pitch": str(pitch_path),
+ "energy": str(energy_path),
+ "wave": str(wave_path),
+ }
+ # add spk_emb for voice cloning
+ if "spk_emb" in item:
+ record["spk_emb"] = str(item["spk_emb"])
+
+ output_metadata.append(record)
+ output_metadata.sort(key=itemgetter('utt_id'))
+ output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
+ with jsonlines.open(output_metadata_path, 'w') as writer:
+ for item in output_metadata:
+ writer.write(item)
+ logging.info(f"metadata dumped into {output_metadata_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/jets/preprocess.py b/paddlespeech/t2s/exps/jets/preprocess.py
new file mode 100644
index 00000000..468941ea
--- /dev/null
+++ b/paddlespeech/t2s/exps/jets/preprocess.py
@@ -0,0 +1,451 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+from concurrent.futures import ThreadPoolExecutor
+from operator import itemgetter
+from pathlib import Path
+from typing import Any
+from typing import Dict
+from typing import List
+
+import jsonlines
+import librosa
+import numpy as np
+import tqdm
+import yaml
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.datasets.get_feats import Energy
+from paddlespeech.t2s.datasets.get_feats import LogMelFBank
+from paddlespeech.t2s.datasets.get_feats import Pitch
+from paddlespeech.t2s.datasets.preprocess_utils import compare_duration_and_mel_length
+from paddlespeech.t2s.datasets.preprocess_utils import get_input_token
+from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
+from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
+from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
+from paddlespeech.t2s.utils import str2bool
+
+
+def process_sentence(config: Dict[str, Any],
+ fp: Path,
+ sentences: Dict,
+ output_dir: Path,
+ mel_extractor=None,
+ pitch_extractor=None,
+ energy_extractor=None,
+ cut_sil: bool=True,
+ spk_emb_dir: Path=None,
+ token_average: bool=True):
+ utt_id = fp.stem
+ # for vctk
+ if utt_id.endswith("_mic2"):
+ utt_id = utt_id[:-5]
+ record = None
+ if utt_id in sentences:
+ # reading, resampling may occur
+ wav, _ = librosa.load(
+ str(fp), sr=config.fs,
+ mono=False) if "canton" in str(fp) else librosa.load(
+ str(fp), sr=config.fs)
+ if len(wav.shape) == 2 and "canton" in str(fp):
+            # Note that Cantonese datasets should be placed in ~/datasets/canton_all; otherwise it may cause problems.
+ wav = wav[0]
+ wav = np.ascontiguousarray(wav)
+ elif len(wav.shape) != 1:
+ return record
+ max_value = np.abs(wav).max()
+ if max_value > 1.0:
+ wav = wav / max_value
+        assert len(wav.shape) == 1, f"{utt_id} is not mono-channel audio."
+        assert np.abs(wav).max(
+        ) <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
+ phones = sentences[utt_id][0]
+ durations = sentences[utt_id][1]
+ speaker = sentences[utt_id][2]
+ d_cumsum = np.pad(np.array(durations).cumsum(0), (1, 0), 'constant')
+        # slightly less precise than using *.TextGrid directly
+ times = librosa.frames_to_time(
+ d_cumsum, sr=config.fs, hop_length=config.n_shift)
+ if cut_sil:
+ start = 0
+ end = d_cumsum[-1]
+ if phones[0] == "sil" and len(durations) > 1:
+ start = times[1]
+ durations = durations[1:]
+ phones = phones[1:]
+ if phones[-1] == 'sil' and len(durations) > 1:
+ end = times[-2]
+ durations = durations[:-1]
+ phones = phones[:-1]
+ sentences[utt_id][0] = phones
+ sentences[utt_id][1] = durations
+ start, end = librosa.time_to_samples([start, end], sr=config.fs)
+ wav = wav[start:end]
+ # extract mel feats
+ logmel = mel_extractor.get_log_mel_fbank(wav)
+ # change duration according to mel_length
+ compare_duration_and_mel_length(sentences, utt_id, logmel)
+ # utt_id may be popped in compare_duration_and_mel_length
+ if utt_id not in sentences:
+ return None
+ phones = sentences[utt_id][0]
+ durations = sentences[utt_id][1]
+ num_frames = logmel.shape[0]
+ assert sum(durations) == num_frames
+ mel_dir = output_dir / "data_feats"
+ mel_dir.mkdir(parents=True, exist_ok=True)
+ mel_path = mel_dir / (utt_id + "_feats.npy")
+ np.save(mel_path, logmel)
+
+ if wav.size < num_frames * config.n_shift:
+ wav = np.pad(
+ wav, (0, num_frames * config.n_shift - wav.size),
+ mode="reflect")
+ else:
+ wav = wav[:num_frames * config.n_shift]
+ wave_dir = output_dir / "data_wave"
+ wave_dir.mkdir(parents=True, exist_ok=True)
+ wav_path = wave_dir / (utt_id + "_wave.npy")
+ # (num_samples, )
+ np.save(wav_path, wav)
+ # extract pitch and energy
+    if token_average:
+ f0 = pitch_extractor.get_pitch(
+ wav,
+ duration=np.array(durations),
+ use_token_averaged_f0=token_average)
+ if (f0 == 0).all():
+ return None
+ assert f0.shape[0] == len(durations)
+ else:
+ f0 = pitch_extractor.get_pitch(
+ wav, use_token_averaged_f0=token_average)
+ if (f0 == 0).all():
+ return None
+ f0 = f0[:num_frames]
+ assert f0.shape[0] == num_frames
+ f0_dir = output_dir / "data_pitch"
+ f0_dir.mkdir(parents=True, exist_ok=True)
+ f0_path = f0_dir / (utt_id + "_pitch.npy")
+ np.save(f0_path, f0)
+    if token_average:
+ energy = energy_extractor.get_energy(
+ wav,
+ duration=np.array(durations),
+ use_token_averaged_energy=token_average)
+ assert energy.shape[0] == len(durations)
+ else:
+ energy = energy_extractor.get_energy(
+ wav, use_token_averaged_energy=token_average)
+ energy = energy[:num_frames]
+ assert energy.shape[0] == num_frames
+
+ energy_dir = output_dir / "data_energy"
+ energy_dir.mkdir(parents=True, exist_ok=True)
+ energy_path = energy_dir / (utt_id + "_energy.npy")
+ np.save(energy_path, energy)
+ record = {
+ "utt_id": utt_id,
+ "phones": phones,
+ "text_lengths": len(phones),
+ "feats_lengths": num_frames,
+ "durations": durations,
+ "feats": str(mel_path),
+ "pitch": str(f0_path),
+ "energy": str(energy_path),
+ "wave": str(wav_path),
+ "speaker": speaker
+ }
+ if spk_emb_dir:
+ if speaker in os.listdir(spk_emb_dir):
+ embed_name = utt_id + ".npy"
+ embed_path = spk_emb_dir / speaker / embed_name
+ if embed_path.is_file():
+ record["spk_emb"] = str(embed_path)
+ else:
+ return None
+ return record
+
+
+def process_sentences(config,
+ fps: List[Path],
+ sentences: Dict,
+ output_dir: Path,
+ mel_extractor=None,
+ pitch_extractor=None,
+ energy_extractor=None,
+ nprocs: int=1,
+ cut_sil: bool=True,
+ spk_emb_dir: Path=None,
+ write_metadata_method: str='w',
+ token_average: bool=True):
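+    # process utterances serially when nprocs == 1, otherwise with a thread pool,
+    # then write the collected records to metadata.jsonl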
+ if nprocs == 1:
+ results = []
+ for fp in tqdm.tqdm(fps, total=len(fps)):
+ record = process_sentence(
+ config=config,
+ fp=fp,
+ sentences=sentences,
+ output_dir=output_dir,
+ mel_extractor=mel_extractor,
+ pitch_extractor=pitch_extractor,
+ energy_extractor=energy_extractor,
+ cut_sil=cut_sil,
+ spk_emb_dir=spk_emb_dir,
+ token_average=token_average)
+ if record:
+ results.append(record)
+ else:
+ with ThreadPoolExecutor(nprocs) as pool:
+ futures = []
+ with tqdm.tqdm(total=len(fps)) as progress:
+ for fp in fps:
+                    future = pool.submit(process_sentence, config, fp,
+                                         sentences, output_dir, mel_extractor,
+                                         pitch_extractor, energy_extractor,
+                                         cut_sil, spk_emb_dir, token_average)
+ future.add_done_callback(lambda p: progress.update())
+ futures.append(future)
+
+ results = []
+ for ft in futures:
+ record = ft.result()
+ if record:
+ results.append(record)
+
+ results.sort(key=itemgetter("utt_id"))
+ with jsonlines.open(output_dir / "metadata.jsonl",
+ write_metadata_method) as writer:
+ for item in results:
+ writer.write(item)
+ print("Done")
+
+
+def main():
+ # parse config and args
+ parser = argparse.ArgumentParser(
+ description="Preprocess audio and then extract features.")
+
+ parser.add_argument(
+ "--dataset",
+ default="baker",
+ type=str,
+ help="name of dataset, should in {baker, aishell3, ljspeech, vctk} now")
+
+ parser.add_argument(
+ "--rootdir", default=None, type=str, help="directory to dataset.")
+
+ parser.add_argument(
+ "--dumpdir",
+ type=str,
+ required=True,
+ help="directory to dump feature files.")
+ parser.add_argument(
+ "--dur-file", default=None, type=str, help="path to durations.txt.")
+
+ parser.add_argument("--config", type=str, help="fastspeech2 config file.")
+
+ parser.add_argument(
+ "--num-cpu", type=int, default=1, help="number of process.")
+
+ parser.add_argument(
+ "--cut-sil",
+ type=str2bool,
+ default=True,
+ help="whether cut sil in the edge of audio")
+
+ parser.add_argument(
+ "--spk_emb_dir",
+ default=None,
+ type=str,
+ help="directory to speaker embedding files.")
+
+ parser.add_argument(
+ "--write_metadata_method",
+ default="w",
+ type=str,
+ choices=["w", "a"],
+ help="How the metadata.jsonl file is written.")
+
+ parser.add_argument(
+ "--token_average",
+ type=str2bool,
+ default=False,
+ help="Average the energy and pitch accroding to durations")
+ args = parser.parse_args()
+
+ rootdir = Path(args.rootdir).expanduser()
+ dumpdir = Path(args.dumpdir).expanduser()
+ # use absolute path
+ dumpdir = dumpdir.resolve()
+ dumpdir.mkdir(parents=True, exist_ok=True)
+ dur_file = Path(args.dur_file).expanduser()
+
+ if args.spk_emb_dir:
+ spk_emb_dir = Path(args.spk_emb_dir).expanduser().resolve()
+ else:
+ spk_emb_dir = None
+
+ assert rootdir.is_dir()
+ assert dur_file.is_file()
+
+ with open(args.config, 'rt') as f:
+ config = CfgNode(yaml.safe_load(f))
+
+ sentences, speaker_set = get_phn_dur(dur_file)
+
+ merge_silence(sentences)
+ phone_id_map_path = dumpdir / "phone_id_map.txt"
+ speaker_id_map_path = dumpdir / "speaker_id_map.txt"
+ get_input_token(sentences, phone_id_map_path, args.dataset)
+ get_spk_id_map(speaker_set, speaker_id_map_path)
+
+ if args.dataset == "baker":
+ wav_files = sorted(list((rootdir / "Wave").rglob("*.wav")))
+ # split data into 3 sections
+ num_train = 9800
+ num_dev = 100
+ train_wav_files = wav_files[:num_train]
+ dev_wav_files = wav_files[num_train:num_train + num_dev]
+ test_wav_files = wav_files[num_train + num_dev:]
+ elif args.dataset == "aishell3":
+ sub_num_dev = 5
+ wav_dir = rootdir / "train" / "wav"
+ train_wav_files = []
+ dev_wav_files = []
+ test_wav_files = []
+ for speaker in os.listdir(wav_dir):
+ wav_files = sorted(list((wav_dir / speaker).rglob("*.wav")))
+ if len(wav_files) > 100:
+ train_wav_files += wav_files[:-sub_num_dev * 2]
+ dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
+ test_wav_files += wav_files[-sub_num_dev:]
+ else:
+ train_wav_files += wav_files
+ elif args.dataset == "canton":
+ sub_num_dev = 5
+ wav_dir = rootdir / "WAV"
+ train_wav_files = []
+ dev_wav_files = []
+ test_wav_files = []
+ for speaker in os.listdir(wav_dir):
+ wav_files = sorted(list((wav_dir / speaker).rglob("*.wav")))
+ if len(wav_files) > 100:
+ train_wav_files += wav_files[:-sub_num_dev * 2]
+ dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
+ test_wav_files += wav_files[-sub_num_dev:]
+ else:
+ train_wav_files += wav_files
+ elif args.dataset == "ljspeech":
+ wav_files = sorted(list((rootdir / "wavs").rglob("*.wav")))
+ # split data into 3 sections
+ num_train = 12900
+ num_dev = 100
+ train_wav_files = wav_files[:num_train]
+ dev_wav_files = wav_files[num_train:num_train + num_dev]
+ test_wav_files = wav_files[num_train + num_dev:]
+ elif args.dataset == "vctk":
+ sub_num_dev = 5
+ wav_dir = rootdir / "wav48_silence_trimmed"
+ train_wav_files = []
+ dev_wav_files = []
+ test_wav_files = []
+ for speaker in os.listdir(wav_dir):
+ wav_files = sorted(list((wav_dir / speaker).rglob("*_mic2.flac")))
+ if len(wav_files) > 100:
+ train_wav_files += wav_files[:-sub_num_dev * 2]
+ dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
+ test_wav_files += wav_files[-sub_num_dev:]
+ else:
+ train_wav_files += wav_files
+
+ else:
+ print("dataset should in {baker, aishell3, ljspeech, vctk} now!")
+
+ train_dump_dir = dumpdir / "train" / "raw"
+ train_dump_dir.mkdir(parents=True, exist_ok=True)
+ dev_dump_dir = dumpdir / "dev" / "raw"
+ dev_dump_dir.mkdir(parents=True, exist_ok=True)
+ test_dump_dir = dumpdir / "test" / "raw"
+ test_dump_dir.mkdir(parents=True, exist_ok=True)
+
+ # Extractor
+ mel_extractor = LogMelFBank(
+ sr=config.fs,
+ n_fft=config.n_fft,
+ hop_length=config.n_shift,
+ win_length=config.win_length,
+ window=config.window,
+ n_mels=config.n_mels,
+ fmin=config.fmin,
+ fmax=config.fmax)
+ pitch_extractor = Pitch(
+ sr=config.fs,
+ hop_length=config.n_shift,
+ f0min=config.f0min,
+ f0max=config.f0max)
+ energy_extractor = Energy(
+ n_fft=config.n_fft,
+ hop_length=config.n_shift,
+ win_length=config.win_length,
+ window=config.window)
+
+ # process for the 3 sections
+ if train_wav_files:
+ process_sentences(
+ config=config,
+ fps=train_wav_files,
+ sentences=sentences,
+ output_dir=train_dump_dir,
+ mel_extractor=mel_extractor,
+ pitch_extractor=pitch_extractor,
+ energy_extractor=energy_extractor,
+ nprocs=args.num_cpu,
+ cut_sil=args.cut_sil,
+ spk_emb_dir=spk_emb_dir,
+ write_metadata_method=args.write_metadata_method,
+ token_average=args.token_average)
+ if dev_wav_files:
+ process_sentences(
+ config=config,
+ fps=dev_wav_files,
+ sentences=sentences,
+ output_dir=dev_dump_dir,
+ mel_extractor=mel_extractor,
+ pitch_extractor=pitch_extractor,
+ energy_extractor=energy_extractor,
+ nprocs=args.num_cpu,
+ cut_sil=args.cut_sil,
+ spk_emb_dir=spk_emb_dir,
+ write_metadata_method=args.write_metadata_method,
+ token_average=args.token_average)
+ if test_wav_files:
+ process_sentences(
+ config=config,
+ fps=test_wav_files,
+ sentences=sentences,
+ output_dir=test_dump_dir,
+ mel_extractor=mel_extractor,
+ pitch_extractor=pitch_extractor,
+ energy_extractor=energy_extractor,
+ nprocs=args.num_cpu,
+ cut_sil=args.cut_sil,
+ spk_emb_dir=spk_emb_dir,
+ write_metadata_method=args.write_metadata_method,
+ token_average=args.token_average)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/jets/synthesize.py b/paddlespeech/t2s/exps/jets/synthesize.py
new file mode 100644
index 00000000..ef26414d
--- /dev/null
+++ b/paddlespeech/t2s/exps/jets/synthesize.py
@@ -0,0 +1,153 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from pathlib import Path
+
+import jsonlines
+import numpy as np
+import paddle
+import soundfile as sf
+import yaml
+from timer import timer
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.datasets.data_table import DataTable
+from paddlespeech.t2s.models.jets import JETS
+from paddlespeech.t2s.utils import str2bool
+
+
+def evaluate(args):
+
+ # construct dataset for evaluation
+ with jsonlines.open(args.test_metadata, 'r') as reader:
+ test_metadata = list(reader)
+ # Init body.
+ with open(args.config) as f:
+ config = CfgNode(yaml.safe_load(f))
+
+ print("========Args========")
+ print(yaml.safe_dump(vars(args)))
+ print("========Config========")
+ print(config)
+
+ fields = ["utt_id", "text"]
+ converters = {}
+
+ spk_num = None
+ if args.speaker_dict is not None:
+ print("multiple speaker jets!")
+ with open(args.speaker_dict, 'rt') as f:
+ spk_id = [line.strip().split() for line in f.readlines()]
+ spk_num = len(spk_id)
+ fields += ["spk_id"]
+ elif args.voice_cloning:
+ print("Evaluating voice cloning!")
+ fields += ["spk_emb"]
+ else:
+ print("single speaker jets!")
+ print("spk_num:", spk_num)
+
+ test_dataset = DataTable(
+ data=test_metadata,
+ fields=fields,
+ converters=converters, )
+
+ with open(args.phones_dict, "r") as f:
+ phn_id = [line.strip().split() for line in f.readlines()]
+ vocab_size = len(phn_id)
+ print("vocab_size:", vocab_size)
+
+ odim = config.n_fft // 2 + 1
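+    # odim: number of STFT frequency bins (n_fft // 2 + 1) passed to the JETS model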
+ config["model"]["generator_params"]["spks"] = spk_num
+
+ jets = JETS(idim=vocab_size, odim=odim, **config["model"])
+ jets.set_state_dict(paddle.load(args.ckpt)["main_params"])
+ jets.eval()
+
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ N = 0
+ T = 0
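+    # N: total number of generated samples, T: total synthesis time,
+    # used to report the overall generation speed and RTF at the end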
+
+ for datum in test_dataset:
+ utt_id = datum["utt_id"]
+ phone_ids = paddle.to_tensor(datum["text"])
+ with timer() as t:
+ with paddle.no_grad():
+ spk_emb = None
+ spk_id = None
+ # multi speaker
+ if args.voice_cloning and "spk_emb" in datum:
+ spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
+ elif "spk_id" in datum:
+ spk_id = paddle.to_tensor(datum["spk_id"])
+ out = jets.inference(
+ text=phone_ids, sids=spk_id, spembs=spk_emb)
+ wav = out["wav"]
+ wav = wav.numpy()
+ N += wav.size
+ T += t.elapse
+ speed = wav.size / t.elapse
+ rtf = config.fs / speed
+ print(
+ f"{utt_id}, wave: {wav.size}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+ )
+ sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.fs)
+ print(f"{utt_id} done!")
+ print(f"generation speed: {N / T}Hz, RTF: {config.fs / (N / T) }")
+
+
+def parse_args():
+ # parse args and config
+ parser = argparse.ArgumentParser(description="Synthesize with JETS")
+ # model
+ parser.add_argument(
+ '--config', type=str, default=None, help='Config of JETS.')
+ parser.add_argument(
+ '--ckpt', type=str, default=None, help='Checkpoint file of JETS.')
+ parser.add_argument(
+ "--phones_dict", type=str, default=None, help="phone vocabulary file.")
+ parser.add_argument(
+ "--speaker_dict", type=str, default=None, help="speaker id map file.")
+ parser.add_argument(
+ "--voice-cloning",
+ type=str2bool,
+ default=False,
+ help="whether training voice cloning model.")
+ # other
+ parser.add_argument(
+ "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+ parser.add_argument("--test_metadata", type=str, help="test metadata.")
+ parser.add_argument("--output_dir", type=str, help="output dir.")
+
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ args = parse_args()
+
+ if args.ngpu == 0:
+ paddle.set_device("cpu")
+ elif args.ngpu > 0:
+ paddle.set_device("gpu")
+ else:
+ print("ngpu should >= 0 !")
+
+ evaluate(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/jets/synthesize_e2e.py b/paddlespeech/t2s/exps/jets/synthesize_e2e.py
new file mode 100644
index 00000000..1c713c06
--- /dev/null
+++ b/paddlespeech/t2s/exps/jets/synthesize_e2e.py
@@ -0,0 +1,189 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from pathlib import Path
+
+import paddle
+import soundfile as sf
+import yaml
+from timer import timer
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.exps.syn_utils import am_to_static
+from paddlespeech.t2s.exps.syn_utils import get_frontend
+from paddlespeech.t2s.exps.syn_utils import get_sentences
+from paddlespeech.t2s.models.jets import JETS
+from paddlespeech.t2s.models.jets import JETSInference
+from paddlespeech.t2s.utils import str2bool
+
+
+def evaluate(args):
+ # Init body.
+ with open(args.config) as f:
+ config = CfgNode(yaml.safe_load(f))
+
+ print("========Args========")
+ print(yaml.safe_dump(vars(args)))
+ print("========Config========")
+ print(config)
+
+ sentences = get_sentences(text_file=args.text, lang=args.lang)
+
+ # frontend
+ frontend = get_frontend(lang=args.lang, phones_dict=args.phones_dict)
+ # acoustic model
+ am_name = args.am[:args.am.rindex('_')]
+ am_dataset = args.am[args.am.rindex('_') + 1:]
+
+ spk_num = None
+ if args.speaker_dict is not None:
+ print("multiple speaker jets!")
+ with open(args.speaker_dict, 'rt') as f:
+ spk_id = [line.strip().split() for line in f.readlines()]
+ spk_num = len(spk_id)
+ else:
+ print("single speaker jets!")
+ print("spk_num:", spk_num)
+
+ with open(args.phones_dict, "r") as f:
+ phn_id = [line.strip().split() for line in f.readlines()]
+ vocab_size = len(phn_id)
+ print("vocab_size:", vocab_size)
+
+ odim = config.n_fft // 2 + 1
+ config["model"]["generator_params"]["spks"] = spk_num
+
+ jets = JETS(idim=vocab_size, odim=odim, **config["model"])
+ jets.set_state_dict(paddle.load(args.ckpt)["main_params"])
+ jets.eval()
+
+ jets_inference = JETSInference(jets)
+ # whether dygraph to static
+ if args.inference_dir:
+ jets_inference = am_to_static(
+ am_inference=jets_inference,
+ am=args.am,
+ inference_dir=args.inference_dir,
+ speaker_dict=args.speaker_dict)
+
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+ merge_sentences = False
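+    # keep sub-sentences separate: each one is synthesized below and the
+    # resulting waveforms are concatenated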
+
+ N = 0
+ T = 0
+ for utt_id, sentence in sentences:
+ with timer() as t:
+ if args.lang == 'zh':
+ input_ids = frontend.get_input_ids(
+ sentence, merge_sentences=merge_sentences)
+ phone_ids = input_ids["phone_ids"]
+ elif args.lang == 'en':
+ input_ids = frontend.get_input_ids(
+ sentence, merge_sentences=merge_sentences)
+ phone_ids = input_ids["phone_ids"]
+ else:
+ print("lang should in {'zh', 'en'}!")
+ with paddle.no_grad():
+ flags = 0
+ for i in range(len(phone_ids)):
+ part_phone_ids = phone_ids[i]
+ spk_id = None
+ if am_dataset in {"aishell3",
+ "vctk"} and spk_num is not None:
+ spk_id = paddle.to_tensor(args.spk_id)
+ wav = jets_inference(part_phone_ids, spk_id)
+ else:
+ wav = jets_inference(part_phone_ids)
+ if flags == 0:
+ wav_all = wav
+ flags = 1
+ else:
+ wav_all = paddle.concat([wav_all, wav])
+ wav = wav_all.numpy()
+ N += wav.size
+ T += t.elapse
+ speed = wav.size / t.elapse
+ rtf = config.fs / speed
+ print(
+ f"{utt_id}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+ )
+ sf.write(str(output_dir / (utt_id + ".wav")), wav, samplerate=config.fs)
+ print(f"{utt_id} done!")
+ print(f"generation speed: {N / T}Hz, RTF: {config.fs / (N / T) }")
+
+
+def parse_args():
+ # parse args and config
+ parser = argparse.ArgumentParser(description="Synthesize with JETS")
+
+ # model
+ parser.add_argument(
+ '--config', type=str, default=None, help='Config of JETS.')
+ parser.add_argument(
+ '--ckpt', type=str, default=None, help='Checkpoint file of JETS.')
+ parser.add_argument(
+ "--phones_dict", type=str, default=None, help="phone vocabulary file.")
+ parser.add_argument(
+ "--speaker_dict", type=str, default=None, help="speaker id map file.")
+ parser.add_argument(
+ '--spk_id',
+ type=int,
+ default=0,
+ help='spk id for multi speaker acoustic model')
+ # other
+ parser.add_argument(
+ '--lang',
+ type=str,
+ default='zh',
+ help='Choose model language. zh or en')
+
+ parser.add_argument(
+ "--inference_dir",
+ type=str,
+ default=None,
+ help="dir to save inference models")
+ parser.add_argument(
+ "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+ parser.add_argument(
+ "--text",
+ type=str,
+ help="text to synthesize, a 'utt_id sentence' pair per line.")
+ parser.add_argument("--output_dir", type=str, help="output dir.")
+
+ parser.add_argument(
+ '--am',
+ type=str,
+ default='jets_csmsc',
+ help='Choose acoustic model type of tts task.')
+
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ args = parse_args()
+
+ if args.ngpu == 0:
+ paddle.set_device("cpu")
+ elif args.ngpu > 0:
+ paddle.set_device("gpu")
+ else:
+ print("ngpu should >= 0 !")
+
+ evaluate(args)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/jets/train.py b/paddlespeech/t2s/exps/jets/train.py
new file mode 100644
index 00000000..7eb4031a
--- /dev/null
+++ b/paddlespeech/t2s/exps/jets/train.py
@@ -0,0 +1,305 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import logging
+import os
+import shutil
+from pathlib import Path
+
+import jsonlines
+import numpy as np
+import paddle
+import yaml
+from paddle import DataParallel
+from paddle import distributed as dist
+from paddle.io import DataLoader
+from paddle.optimizer import AdamW
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.datasets.am_batch_fn import jets_multi_spk_batch_fn
+from paddlespeech.t2s.datasets.am_batch_fn import jets_single_spk_batch_fn
+from paddlespeech.t2s.datasets.data_table import DataTable
+from paddlespeech.t2s.datasets.sampler import ErnieSATSampler
+from paddlespeech.t2s.models.jets import JETS
+from paddlespeech.t2s.models.jets import JETSEvaluator
+from paddlespeech.t2s.models.jets import JETSUpdater
+from paddlespeech.t2s.modules.losses import DiscriminatorAdversarialLoss
+from paddlespeech.t2s.modules.losses import FeatureMatchLoss
+from paddlespeech.t2s.modules.losses import ForwardSumLoss
+from paddlespeech.t2s.modules.losses import GeneratorAdversarialLoss
+from paddlespeech.t2s.modules.losses import MelSpectrogramLoss
+from paddlespeech.t2s.modules.losses import VarianceLoss
+from paddlespeech.t2s.training.extensions.snapshot import Snapshot
+from paddlespeech.t2s.training.extensions.visualizer import VisualDL
+from paddlespeech.t2s.training.optimizer import scheduler_classes
+from paddlespeech.t2s.training.seeding import seed_everything
+from paddlespeech.t2s.training.trainer import Trainer
+from paddlespeech.t2s.utils import str2bool
+
+
+def train_sp(args, config):
+ # decides device type and whether to run in parallel
+ # setup running environment correctly
+ world_size = paddle.distributed.get_world_size()
+ if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
+ paddle.set_device("cpu")
+ else:
+ paddle.set_device("gpu")
+ if world_size > 1:
+ paddle.distributed.init_parallel_env()
+
+ # set the random seed, it is a must for multiprocess training
+ seed_everything(config.seed)
+
+ print(
+ f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
+ )
+
+    # the DataLoader logger is too verbose, disable it
+ logging.getLogger("DataLoader").disabled = True
+
+ fields = [
+ "text", "text_lengths", "feats", "feats_lengths", "wave", "durations",
+ "pitch", "energy"
+ ]
+
+ converters = {
+ "wave": np.load,
+ "feats": np.load,
+ "pitch": np.load,
+ "energy": np.load,
+ }
+ spk_num = None
+ if args.speaker_dict is not None:
+ print("multiple speaker jets!")
+ collate_fn = jets_multi_spk_batch_fn
+ with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
+ spk_id = [line.strip().split() for line in f.readlines()]
+ spk_num = len(spk_id)
+ fields += ["spk_id"]
+ elif args.voice_cloning:
+ print("Training voice cloning!")
+ collate_fn = jets_multi_spk_batch_fn
+ fields += ["spk_emb"]
+ converters["spk_emb"] = np.load
+ else:
+ print("single speaker jets!")
+ collate_fn = jets_single_spk_batch_fn
+ print("spk_num:", spk_num)
+
+ # construct dataset for training and validation
+ with jsonlines.open(args.train_metadata, 'r') as reader:
+ train_metadata = list(reader)
+ train_dataset = DataTable(
+ data=train_metadata,
+ fields=fields,
+ converters=converters, )
+ with jsonlines.open(args.dev_metadata, 'r') as reader:
+ dev_metadata = list(reader)
+ dev_dataset = DataTable(
+ data=dev_metadata,
+ fields=fields,
+ converters=converters, )
+
+ # collate function and dataloader
+ train_sampler = ErnieSATSampler(
+ train_dataset,
+ batch_size=config.batch_size,
+ shuffle=False,
+ drop_last=True)
+ dev_sampler = ErnieSATSampler(
+ dev_dataset,
+ batch_size=config.batch_size,
+ shuffle=False,
+ drop_last=False)
+ print("samplers done!")
+
+ train_dataloader = DataLoader(
+ train_dataset,
+ batch_sampler=train_sampler,
+ collate_fn=collate_fn,
+ num_workers=config.num_workers)
+
+ dev_dataloader = DataLoader(
+ dev_dataset,
+ batch_sampler=dev_sampler,
+ collate_fn=collate_fn,
+ num_workers=config.num_workers)
+ print("dataloaders done!")
+
+ with open(args.phones_dict, 'rt', encoding='utf-8') as f:
+ phn_id = [line.strip().split() for line in f.readlines()]
+ vocab_size = len(phn_id)
+ print("vocab_size:", vocab_size)
+
+ odim = config.n_mels
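+    # odim: number of mel bins, i.e. the dimension of the extracted acoustic features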
+ config["model"]["generator_params"]["spks"] = spk_num
+ model = JETS(idim=vocab_size, odim=odim, **config["model"])
+ gen_parameters = model.generator.parameters()
+ dis_parameters = model.discriminator.parameters()
+ if world_size > 1:
+ model = DataParallel(model)
+ gen_parameters = model._layers.generator.parameters()
+ dis_parameters = model._layers.discriminator.parameters()
+
+ print("model done!")
+
+ # loss
+ criterion_mel = MelSpectrogramLoss(
+ **config["mel_loss_params"], )
+ criterion_feat_match = FeatureMatchLoss(
+ **config["feat_match_loss_params"], )
+ criterion_gen_adv = GeneratorAdversarialLoss(
+ **config["generator_adv_loss_params"], )
+ criterion_dis_adv = DiscriminatorAdversarialLoss(
+ **config["discriminator_adv_loss_params"], )
+ criterion_var = VarianceLoss()
+ criterion_forwardsum = ForwardSumLoss()
+
+ print("criterions done!")
+
+ lr_schedule_g = scheduler_classes[config["generator_scheduler"]](
+ **config["generator_scheduler_params"])
+ optimizer_g = AdamW(
+ learning_rate=lr_schedule_g,
+ parameters=gen_parameters,
+ **config["generator_optimizer_params"])
+
+ lr_schedule_d = scheduler_classes[config["discriminator_scheduler"]](
+ **config["discriminator_scheduler_params"])
+ optimizer_d = AdamW(
+ learning_rate=lr_schedule_d,
+ parameters=dis_parameters,
+ **config["discriminator_optimizer_params"])
+
+ print("optimizers done!")
+
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+ if dist.get_rank() == 0:
+ config_name = args.config.split("/")[-1]
+ # copy conf to output_dir
+ shutil.copyfile(args.config, output_dir / config_name)
+
+ updater = JETSUpdater(
+ model=model,
+ optimizers={
+ "generator": optimizer_g,
+ "discriminator": optimizer_d,
+ },
+ criterions={
+ "mel": criterion_mel,
+ "feat_match": criterion_feat_match,
+ "gen_adv": criterion_gen_adv,
+ "dis_adv": criterion_dis_adv,
+ "var": criterion_var,
+ "forwardsum": criterion_forwardsum,
+ },
+ schedulers={
+ "generator": lr_schedule_g,
+ "discriminator": lr_schedule_d,
+ },
+ dataloader=train_dataloader,
+ lambda_adv=config.lambda_adv,
+ lambda_mel=config.lambda_mel,
+ lambda_feat_match=config.lambda_feat_match,
+ lambda_var=config.lambda_var,
+ lambda_align=config.lambda_align,
+ generator_first=config.generator_first,
+ use_alignment_module=config.use_alignment_module,
+ output_dir=output_dir)
+
+ evaluator = JETSEvaluator(
+ model=model,
+ criterions={
+ "mel": criterion_mel,
+ "feat_match": criterion_feat_match,
+ "gen_adv": criterion_gen_adv,
+ "dis_adv": criterion_dis_adv,
+ "var": criterion_var,
+ "forwardsum": criterion_forwardsum,
+ },
+ dataloader=dev_dataloader,
+ lambda_adv=config.lambda_adv,
+ lambda_mel=config.lambda_mel,
+ lambda_feat_match=config.lambda_feat_match,
+ lambda_var=config.lambda_var,
+ lambda_align=config.lambda_align,
+ generator_first=config.generator_first,
+ use_alignment_module=config.use_alignment_module,
+ output_dir=output_dir)
+
+ trainer = Trainer(
+ updater,
+ stop_trigger=(config.train_max_steps, "iteration"),
+ out=output_dir)
+
+ if dist.get_rank() == 0:
+ trainer.extend(
+ evaluator, trigger=(config.eval_interval_steps, 'iteration'))
+ trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
+ trainer.extend(
+ Snapshot(max_size=config.num_snapshots),
+ trigger=(config.save_interval_steps, 'iteration'))
+
+ print("Trainer Done!")
+ trainer.run()
+
+
+def main():
+ # parse args and config and redirect to train_sp
+
+ parser = argparse.ArgumentParser(description="Train a JETS model.")
+ parser.add_argument("--config", type=str, help="JETS config file")
+ parser.add_argument("--train-metadata", type=str, help="training data.")
+ parser.add_argument("--dev-metadata", type=str, help="dev data.")
+ parser.add_argument("--output-dir", type=str, help="output dir.")
+ parser.add_argument(
+ "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+ parser.add_argument(
+ "--phones-dict", type=str, default=None, help="phone vocabulary file.")
+ parser.add_argument(
+ "--speaker-dict",
+ type=str,
+ default=None,
+ help="speaker id map file for multiple speaker model.")
+
+ parser.add_argument(
+ "--voice-cloning",
+ type=str2bool,
+ default=False,
+ help="whether training voice cloning model.")
+
+ args = parser.parse_args()
+
+ with open(args.config, 'rt') as f:
+ config = CfgNode(yaml.safe_load(f))
+
+ print("========Args========")
+ print(yaml.safe_dump(vars(args)))
+ print("========Config========")
+ print(config)
+ print(
+ f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
+ )
+
+ # dispatch
+ if args.ngpu > 1:
+ dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
+ else:
+ train_sp(args, config)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/speedyspeech/preprocess.py b/paddlespeech/t2s/exps/speedyspeech/preprocess.py
index e4084c14..75a1b079 100644
--- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py
+++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py
@@ -280,6 +280,7 @@ def main():
sentences=sentences,
output_dir=dev_dump_dir,
mel_extractor=mel_extractor,
+ nprocs=args.num_cpu,
cut_sil=args.cut_sil,
use_relative_path=args.use_relative_path)
if test_wav_files:
diff --git a/paddlespeech/t2s/exps/starganv2_vc/normalize.py b/paddlespeech/t2s/exps/starganv2_vc/normalize.py
new file mode 100644
index 00000000..c063c46f
--- /dev/null
+++ b/paddlespeech/t2s/exps/starganv2_vc/normalize.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Normalize feature files and dump them."""
+import argparse
+import logging
+from operator import itemgetter
+from pathlib import Path
+
+import jsonlines
+import numpy as np
+import tqdm
+
+from paddlespeech.t2s.datasets.data_table import DataTable
+
+
+def main():
+ """Run preprocessing process."""
+ parser = argparse.ArgumentParser(
+ description="Normalize dumped raw features (See detail in parallel_wavegan/bin/normalize.py)."
+ )
+ parser.add_argument(
+ "--metadata",
+ type=str,
+ required=True,
+ help="directory including feature files to be normalized. "
+ "you need to specify either *-scp or rootdir.")
+
+ parser.add_argument(
+ "--dumpdir",
+ type=str,
+ required=True,
+ help="directory to dump normalized feature files.")
+
+ parser.add_argument(
+ "--speaker-dict", type=str, default=None, help="speaker id map file.")
+
+ args = parser.parse_args()
+
+ dumpdir = Path(args.dumpdir).expanduser()
+ # use absolute path
+ dumpdir = dumpdir.resolve()
+ dumpdir.mkdir(parents=True, exist_ok=True)
+
+ # get dataset
+ with jsonlines.open(args.metadata, 'r') as reader:
+ metadata = list(reader)
+ dataset = DataTable(
+ metadata, converters={
+ "speech": np.load,
+ })
+ logging.info(f"The number of files = {len(dataset)}.")
+
+ vocab_speaker = {}
+ with open(args.speaker_dict, 'rt') as f:
+ spk_id = [line.strip().split() for line in f.readlines()]
+ for spk, id in spk_id:
+ vocab_speaker[spk] = int(id)
+
+ # process each file
+ output_metadata = []
+
+ for item in tqdm.tqdm(dataset):
+ utt_id = item['utt_id']
+ speech = item['speech']
+
+ # normalize
+        # hard-coded here for now
+ mean, std = -4, 4
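+        # the same constants (-4, 4) are used for log-mel normalization in starganv2_vc/vc.py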
+ speech = (speech - mean) / std
+ speech_path = dumpdir / f"{utt_id}_speech.npy"
+ np.save(speech_path, speech.astype(np.float32), allow_pickle=False)
+
+ spk_id = vocab_speaker[item["speaker"]]
+ record = {
+ "utt_id": item['utt_id'],
+ "spk_id": spk_id,
+ "speech": str(speech_path),
+ }
+
+ output_metadata.append(record)
+ output_metadata.sort(key=itemgetter('utt_id'))
+ output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
+ with jsonlines.open(output_metadata_path, 'w') as writer:
+ for item in output_metadata:
+ writer.write(item)
+ logging.info(f"metadata dumped into {output_metadata_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/starganv2_vc/preprocess.py b/paddlespeech/t2s/exps/starganv2_vc/preprocess.py
new file mode 100644
index 00000000..053c3b32
--- /dev/null
+++ b/paddlespeech/t2s/exps/starganv2_vc/preprocess.py
@@ -0,0 +1,214 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+from concurrent.futures import ThreadPoolExecutor
+from operator import itemgetter
+from pathlib import Path
+from typing import Any
+from typing import Dict
+from typing import List
+
+import jsonlines
+import librosa
+import numpy as np
+import tqdm
+import yaml
+from yacs.config import CfgNode
+
+from paddlespeech.t2s.datasets.get_feats import LogMelFBank
+from paddlespeech.t2s.datasets.preprocess_utils import get_spk_id_map
+
+speaker_set = set()
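+# filled while processing utterances; dumped to speaker_id_map.txt at the end of main()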
+
+
+def process_sentence(config: Dict[str, Any],
+ fp: Path,
+ output_dir: Path,
+ mel_extractor=None):
+ utt_id = fp.stem
+ # for vctk
+ if utt_id.endswith("_mic2"):
+ utt_id = utt_id[:-5]
+ speaker = utt_id.split('_')[0]
+ speaker_set.add(speaker)
+    # the speaker needs to be obtained additionally (parsed from the utterance id above)
+ record = None
+ # reading, resampling may occur
+    # bug inherited from the original implementation: the audio is read at 24000 Hz, but the mel is extracted with a 16000 Hz config
+    # for details see https://github.com/PaddlePaddle/PaddleSpeech/blob/c7d24ba42c377fe4c0765c6b1faa202a9aeb136f/paddlespeech/t2s/exps/starganv2_vc/vc.py#L165
+    # this should later be changed to read the audio and extract the mel both at 24000 Hz
+ wav, _ = librosa.load(str(fp), sr=24000)
+ max_value = np.abs(wav).max()
+ if max_value > 1.0:
+ wav = wav / max_value
+    assert len(wav.shape) == 1, f"{utt_id} is not mono-channel audio."
+    assert np.abs(
+        wav).max() <= 1.0, f"{utt_id} seems to be different from 16 bit PCM."
+ # extract mel feats
+    # note that base='e' is used here; it should later be changed to base='10', which all of our other TTS models use
+ logmel = mel_extractor.get_log_mel_fbank(wav, base='e')
+ mel_path = output_dir / (utt_id + "_speech.npy")
+ np.save(mel_path, logmel)
+ record = {"utt_id": utt_id, "speech": str(mel_path), "speaker": speaker}
+ return record
+
+
+def process_sentences(
+ config,
+ fps: List[Path],
+ output_dir: Path,
+ mel_extractor=None,
+ nprocs: int=1, ):
+ if nprocs == 1:
+ results = []
+ for fp in tqdm.tqdm(fps, total=len(fps)):
+ record = process_sentence(
+ config=config,
+ fp=fp,
+ output_dir=output_dir,
+ mel_extractor=mel_extractor)
+ if record:
+ results.append(record)
+ else:
+ with ThreadPoolExecutor(nprocs) as pool:
+ futures = []
+ with tqdm.tqdm(total=len(fps)) as progress:
+ for fp in fps:
+ future = pool.submit(process_sentence, config, fp,
+ output_dir, mel_extractor)
+ future.add_done_callback(lambda p: progress.update())
+ futures.append(future)
+
+ results = []
+ for ft in futures:
+ record = ft.result()
+ if record:
+ results.append(record)
+
+ results.sort(key=itemgetter("utt_id"))
+ with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
+ for item in results:
+ writer.write(item)
+ print("Done")
+
+
+def main():
+ # parse config and args
+ parser = argparse.ArgumentParser(
+ description="Preprocess audio and then extract features.")
+
+ parser.add_argument(
+ "--dataset",
+ default="vctk",
+ type=str,
+ help="name of dataset, should in {vctk} now")
+
+ parser.add_argument(
+ "--rootdir", default=None, type=str, help="directory to dataset.")
+
+ parser.add_argument(
+ "--dumpdir",
+ type=str,
+ required=True,
+ help="directory to dump feature files.")
+
+ parser.add_argument("--config", type=str, help="StarGANv2VC config file.")
+
+ parser.add_argument(
+ "--num-cpu", type=int, default=1, help="number of process.")
+
+ args = parser.parse_args()
+
+ rootdir = Path(args.rootdir).expanduser()
+ dumpdir = Path(args.dumpdir).expanduser()
+ # use absolute path
+ dumpdir = dumpdir.resolve()
+ dumpdir.mkdir(parents=True, exist_ok=True)
+
+ assert rootdir.is_dir()
+
+ with open(args.config, 'rt') as f:
+ config = CfgNode(yaml.safe_load(f))
+
+ if args.dataset == "vctk":
+ sub_num_dev = 5
+ wav_dir = rootdir / "wav48_silence_trimmed"
+ train_wav_files = []
+ dev_wav_files = []
+ test_wav_files = []
+ # only for test
+ for speaker in os.listdir(wav_dir):
+ wav_files = sorted(list((wav_dir / speaker).rglob("*_mic2.flac")))
+ if len(wav_files) > 100:
+ train_wav_files += wav_files[:-sub_num_dev * 2]
+ dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
+ test_wav_files += wav_files[-sub_num_dev:]
+ else:
+ train_wav_files += wav_files
+
+ else:
+ print("dataset should in {vctk} now!")
+
+ train_dump_dir = dumpdir / "train" / "raw"
+ train_dump_dir.mkdir(parents=True, exist_ok=True)
+ dev_dump_dir = dumpdir / "dev" / "raw"
+ dev_dump_dir.mkdir(parents=True, exist_ok=True)
+ test_dump_dir = dumpdir / "test" / "raw"
+ test_dump_dir.mkdir(parents=True, exist_ok=True)
+
+ # Extractor
+ mel_extractor = LogMelFBank(
+ sr=config.fs,
+ n_fft=config.n_fft,
+ hop_length=config.n_shift,
+ win_length=config.win_length,
+ window=config.window,
+ n_mels=config.n_mels,
+ fmin=config.fmin,
+ fmax=config.fmax,
+ # None here
+ norm=config.norm,
+ htk=config.htk,
+ power=config.power)
+
+ # process for the 3 sections
+ if train_wav_files:
+ process_sentences(
+ config=config,
+ fps=train_wav_files,
+ output_dir=train_dump_dir,
+ mel_extractor=mel_extractor,
+ nprocs=args.num_cpu)
+ if dev_wav_files:
+ process_sentences(
+ config=config,
+ fps=dev_wav_files,
+ output_dir=dev_dump_dir,
+ mel_extractor=mel_extractor,
+ nprocs=args.num_cpu)
+ if test_wav_files:
+ process_sentences(
+ config=config,
+ fps=test_wav_files,
+ output_dir=test_dump_dir,
+ mel_extractor=mel_extractor,
+ nprocs=args.num_cpu)
+
+ speaker_id_map_path = dumpdir / "speaker_id_map.txt"
+ get_spk_id_map(speaker_set, speaker_id_map_path)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/starganv2_vc/train.py b/paddlespeech/t2s/exps/starganv2_vc/train.py
new file mode 100644
index 00000000..94fa3032
--- /dev/null
+++ b/paddlespeech/t2s/exps/starganv2_vc/train.py
@@ -0,0 +1,274 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import logging
+import os
+import shutil
+from pathlib import Path
+
+import jsonlines
+import numpy as np
+import paddle
+import yaml
+from paddle import DataParallel
+from paddle import distributed as dist
+from paddle.io import DataLoader
+from paddle.io import DistributedBatchSampler
+from paddle.optimizer import AdamW
+from paddle.optimizer.lr import OneCycleLR
+from yacs.config import CfgNode
+
+from paddlespeech.cli.utils import download_and_decompress
+from paddlespeech.resource.pretrained_models import StarGANv2VC_source
+from paddlespeech.t2s.datasets.am_batch_fn import build_starganv2_vc_collate_fn
+from paddlespeech.t2s.datasets.data_table import StarGANv2VCDataTable
+from paddlespeech.t2s.models.starganv2_vc import ASRCNN
+from paddlespeech.t2s.models.starganv2_vc import Discriminator
+from paddlespeech.t2s.models.starganv2_vc import Generator
+from paddlespeech.t2s.models.starganv2_vc import JDCNet
+from paddlespeech.t2s.models.starganv2_vc import MappingNetwork
+from paddlespeech.t2s.models.starganv2_vc import StarGANv2VCEvaluator
+from paddlespeech.t2s.models.starganv2_vc import StarGANv2VCUpdater
+from paddlespeech.t2s.models.starganv2_vc import StyleEncoder
+from paddlespeech.t2s.training.extensions.snapshot import Snapshot
+from paddlespeech.t2s.training.extensions.visualizer import VisualDL
+from paddlespeech.t2s.training.seeding import seed_everything
+from paddlespeech.t2s.training.trainer import Trainer
+from paddlespeech.utils.env import MODEL_HOME
+
+
+def train_sp(args, config):
+ # decides device type and whether to run in parallel
+ # setup running environment correctly
+ world_size = paddle.distributed.get_world_size()
+ if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
+ paddle.set_device("cpu")
+ else:
+ paddle.set_device("gpu")
+ if world_size > 1:
+ paddle.distributed.init_parallel_env()
+
+ # set the random seed, it is a must for multiprocess training
+ seed_everything(config.seed)
+
+ print(
+ f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
+ )
+ # to edit
+ fields = ["speech", "speech_lengths"]
+ converters = {"speech": np.load}
+
+ collate_fn = build_starganv2_vc_collate_fn(
+ latent_dim=config['mapping_network_params']['latent_dim'],
+ max_mel_length=config['max_mel_length'])
+
+    # the DataLoader logger is too verbose, disable it
+ logging.getLogger("DataLoader").disabled = True
+
+ # construct dataset for training and validation
+ with jsonlines.open(args.train_metadata, 'r') as reader:
+ train_metadata = list(reader)
+ train_dataset = StarGANv2VCDataTable(data=train_metadata)
+ with jsonlines.open(args.dev_metadata, 'r') as reader:
+ dev_metadata = list(reader)
+ dev_dataset = StarGANv2VCDataTable(data=dev_metadata)
+
+ # collate function and dataloader
+ train_sampler = DistributedBatchSampler(
+ train_dataset,
+ batch_size=config.batch_size,
+ shuffle=True,
+ drop_last=True)
+
+ print("samplers done!")
+
+ train_dataloader = DataLoader(
+ train_dataset,
+ batch_sampler=train_sampler,
+ collate_fn=collate_fn,
+ num_workers=config.num_workers)
+
+ dev_dataloader = DataLoader(
+ dev_dataset,
+ shuffle=False,
+ drop_last=False,
+ batch_size=config.batch_size,
+ collate_fn=collate_fn,
+ num_workers=config.num_workers)
+
+ print("dataloaders done!")
+
+ # load model
+ model_version = '1.0'
+ uncompress_path = download_and_decompress(StarGANv2VC_source[model_version],
+ MODEL_HOME)
+    # adjust num_domains according to the number of speakers
+    # the pretrained model of the original implementation and default.yaml both use 20 by default
+ if args.speaker_dict is not None:
+ with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
+ spk_id = [line.strip().split() for line in f.readlines()]
+ spk_num = len(spk_id)
+ print("spk_num:", spk_num)
+ config['mapping_network_params']['num_domains'] = spk_num
+ config['style_encoder_params']['num_domains'] = spk_num
+ config['discriminator_params']['num_domains'] = spk_num
+
+ generator = Generator(**config['generator_params'])
+ mapping_network = MappingNetwork(**config['mapping_network_params'])
+ style_encoder = StyleEncoder(**config['style_encoder_params'])
+ discriminator = Discriminator(**config['discriminator_params'])
+
+ # load pretrained model
+ jdc_model_dir = os.path.join(uncompress_path, 'jdcnet.pdz')
+ asr_model_dir = os.path.join(uncompress_path, 'asr.pdz')
+
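+    # JDCNet (pitch extraction) and ASRCNN (linguistic features) are loaded from pretrained
+    # weights, kept in eval mode, and passed to the updater/evaluator as fixed auxiliary networks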
+ F0_model = JDCNet(num_class=1, seq_len=config['max_mel_length'])
+ F0_model.set_state_dict(paddle.load(jdc_model_dir)['main_params'])
+ F0_model.eval()
+
+ asr_model = ASRCNN(**config['asr_params'])
+ asr_model.set_state_dict(paddle.load(asr_model_dir)['main_params'])
+ asr_model.eval()
+
+ if world_size > 1:
+ generator = DataParallel(generator)
+ discriminator = DataParallel(discriminator)
+ print("models done!")
+
+ lr_schedule_g = OneCycleLR(**config["generator_scheduler_params"])
+ optimizer_g = AdamW(
+ learning_rate=lr_schedule_g,
+ parameters=generator.parameters(),
+ **config["generator_optimizer_params"])
+
+ lr_schedule_s = OneCycleLR(**config["style_encoder_scheduler_params"])
+ optimizer_s = AdamW(
+ learning_rate=lr_schedule_s,
+ parameters=style_encoder.parameters(),
+ **config["style_encoder_optimizer_params"])
+
+ lr_schedule_m = OneCycleLR(**config["mapping_network_scheduler_params"])
+ optimizer_m = AdamW(
+ learning_rate=lr_schedule_m,
+ parameters=mapping_network.parameters(),
+ **config["mapping_network_optimizer_params"])
+
+ lr_schedule_d = OneCycleLR(**config["discriminator_scheduler_params"])
+ optimizer_d = AdamW(
+ learning_rate=lr_schedule_d,
+ parameters=discriminator.parameters(),
+ **config["discriminator_optimizer_params"])
+ print("optimizers done!")
+
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+ if dist.get_rank() == 0:
+ config_name = args.config.split("/")[-1]
+ # copy conf to output_dir
+ shutil.copyfile(args.config, output_dir / config_name)
+
+ updater = StarGANv2VCUpdater(
+ models={
+ "generator": generator,
+ "style_encoder": style_encoder,
+ "mapping_network": mapping_network,
+ "discriminator": discriminator,
+ "F0_model": F0_model,
+ "asr_model": asr_model,
+ },
+ optimizers={
+ "generator": optimizer_g,
+ "style_encoder": optimizer_s,
+ "mapping_network": optimizer_m,
+ "discriminator": optimizer_d,
+ },
+ schedulers={
+ "generator": lr_schedule_g,
+ "style_encoder": lr_schedule_s,
+ "mapping_network": lr_schedule_m,
+ "discriminator": lr_schedule_d,
+ },
+ dataloader=train_dataloader,
+ g_loss_params=config.loss_params.g_loss,
+ d_loss_params=config.loss_params.d_loss,
+ adv_cls_epoch=config.loss_params.adv_cls_epoch,
+ con_reg_epoch=config.loss_params.con_reg_epoch,
+ output_dir=output_dir)
+
+ evaluator = StarGANv2VCEvaluator(
+ models={
+ "generator": generator,
+ "style_encoder": style_encoder,
+ "mapping_network": mapping_network,
+ "discriminator": discriminator,
+ "F0_model": F0_model,
+ "asr_model": asr_model,
+ },
+ dataloader=dev_dataloader,
+ g_loss_params=config.loss_params.g_loss,
+ d_loss_params=config.loss_params.d_loss,
+ adv_cls_epoch=config.loss_params.adv_cls_epoch,
+ con_reg_epoch=config.loss_params.con_reg_epoch,
+ output_dir=output_dir)
+
+ trainer = Trainer(updater, (config.max_epoch, 'epoch'), output_dir)
+
+ if dist.get_rank() == 0:
+ trainer.extend(evaluator, trigger=(1, "epoch"))
+ trainer.extend(VisualDL(output_dir), trigger=(1, "iteration"))
+ trainer.extend(
+ Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
+ print("Trainer Done!")
+
+ trainer.run()
+
+
+def main():
+ # parse args and config and redirect to train_sp
+
+    parser = argparse.ArgumentParser(description="Train a StarGANv2-VC model.")
+    parser.add_argument("--config", type=str, help="StarGANv2-VC config file.")
+ parser.add_argument("--train-metadata", type=str, help="training data.")
+ parser.add_argument("--dev-metadata", type=str, help="dev data.")
+ parser.add_argument("--output-dir", type=str, help="output dir.")
+ parser.add_argument(
+ "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+ parser.add_argument(
+ "--speaker-dict",
+ type=str,
+ default=None,
+ help="speaker id map file for multiple speaker model.")
+
+ args = parser.parse_args()
+
+ with open(args.config, 'rt') as f:
+ config = CfgNode(yaml.safe_load(f))
+
+ print("========Args========")
+ print(yaml.safe_dump(vars(args)))
+ print("========Config========")
+ print(config)
+ print(
+ f"master see the word size: {dist.get_world_size()}, from pid: {os.getpid()}"
+ )
+
+ # dispatch
+ if args.ngpu > 1:
+ dist.spawn(train_sp, (args, config), nprocs=args.ngpu)
+ else:
+ train_sp(args, config)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/paddlespeech/t2s/exps/starganv2_vc/vc.py b/paddlespeech/t2s/exps/starganv2_vc/vc.py
index ffb25741..24d3dcf8 100644
--- a/paddlespeech/t2s/exps/starganv2_vc/vc.py
+++ b/paddlespeech/t2s/exps/starganv2_vc/vc.py
@@ -57,9 +57,10 @@ def get_mel_extractor():
def preprocess(wave, mel_extractor):
+ # (T, 80)
logmel = mel_extractor.get_log_mel_fbank(wave, base='e')
- # [1, 80, 1011]
mean, std = -4, 4
+ # [1, 80, T]
mel_tensor = (paddle.to_tensor(logmel.T).unsqueeze(0) - mean) / std
return mel_tensor
@@ -67,6 +68,7 @@ def preprocess(wave, mel_extractor):
def compute_style(speaker_dicts, mel_extractor, style_encoder, mapping_network):
reference_embeddings = {}
for key, (path, speaker) in speaker_dicts.items():
+ # path = ''
if path == '':
label = paddle.to_tensor([speaker], dtype=paddle.int64)
latent_dim = mapping_network.shared[0].weight.shape[0]
@@ -164,6 +166,15 @@ def voice_conversion(args, uncompress_path):
wave, sr = librosa.load(args.source_path, sr=24000)
source = preprocess(wave=wave, mel_extractor=mel_extractor)
+    # # check whether the output of preprocess.py is OK
+    # # using the raw features and then normalizing works here
+    # # using the normalized features directly also works here
+    # import numpy as np
+    # source = np.load("~/PaddleSpeech_stargan_preprocess/PaddleSpeech/examples/vctk/vc3/dump/train/norm/p329_414_speech.npy")
+    # # !!! the operation applied after mel_extractor normalization
+    # # [1, 80, T]
+    # source = paddle.to_tensor(source.T).unsqueeze(0)
+
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
orig_wav_name = str(output_dir / 'orig_voc.wav')
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index 354636b4..b720ae48 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -20,6 +20,7 @@ from typing import Dict
from typing import List
from typing import Optional
+import jsonlines
import numpy as np
import onnxruntime as ort
import paddle
@@ -30,9 +31,10 @@ from paddle.static import InputSpec
from paddlespeech.t2s.datasets.am_batch_fn import *
from paddlespeech.t2s.datasets.data_table import DataTable
from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static
-from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.canton_frontend import CantonFrontend
+from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
+from paddlespeech.t2s.frontend.sing_frontend import SingFrontend
from paddlespeech.t2s.frontend.zh_frontend import Frontend
from paddlespeech.t2s.modules.normalizer import ZScore
from paddlespeech.utils.dynamic_import import dynamic_import
@@ -55,6 +57,11 @@ model_alias = {
"paddlespeech.t2s.models.tacotron2:Tacotron2",
"tacotron2_inference":
"paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
+ "diffsinger":
+ "paddlespeech.t2s.models.diffsinger:DiffSinger",
+ "diffsinger_inference":
+ "paddlespeech.t2s.models.diffsinger:DiffSingerInference",
+
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
@@ -91,14 +98,23 @@ def norm(data, mean, std):
return (data - mean) / std
-def get_chunks(data, block_size: int, pad_size: int):
- data_len = data.shape[1]
+def get_chunks(mel, chunk_size: int, pad_size: int):
+ """
+ Split mel by chunk size with left and right context.
+
+ Args:
+ mel (paddle.Tensor): mel spectrogram, shape (B, T, D)
+ chunk_size (int): chunk size
+ pad_size (int): size for left and right context.
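+    Returns:
+        List[paddle.Tensor]: mel chunks, each padded with up to pad_size frames of context.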
+ """
+ T = mel.shape[1]
+ n = math.ceil(T / chunk_size)
+
chunks = []
- n = math.ceil(data_len / block_size)
for i in range(n):
- start = max(0, i * block_size - pad_size)
- end = min((i + 1) * block_size + pad_size, data_len)
- chunks.append(data[:, start:end, :])
+ start = max(0, i * chunk_size - pad_size)
+ end = min((i + 1) * chunk_size + pad_size, T)
+ chunks.append(mel[:, start:end, :])
return chunks
@@ -109,18 +125,27 @@ def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
with open(text_file, 'rt', encoding='utf-8') as f:
for line in f:
if line.strip() != "":
- items = re.split(r"\s+", line.strip(), 1)
+ items = re.split(r"\s+", line.strip(), maxsplit=1)
+ assert len(items) == 2
utt_id = items[0]
- if lang in {'zh', 'canton'}:
- sentence = "".join(items[1:])
- elif lang == 'en':
- sentence = " ".join(items[1:])
- elif lang == 'mix':
- sentence = " ".join(items[1:])
+ sentence = items[1]
sentences.append((utt_id, sentence))
return sentences
+# input for svs
+def get_sentences_svs(text_file: Optional[os.PathLike]):
+ # construct dataset for evaluation
+ sentences = []
+ with jsonlines.open(text_file, 'r') as reader:
+ svs_inputs = list(reader)
+ for svs_input in svs_inputs:
+ utt_id = svs_input['utt_id']
+ sentence = svs_input
+ sentences.append((utt_id, sentence))
+ return sentences
+
+
# am only
def get_test_dataset(test_metadata: List[Dict[str, Any]],
am: str,
@@ -141,6 +166,8 @@ def get_test_dataset(test_metadata: List[Dict[str, Any]],
fields += ["spk_emb"]
else:
print("single speaker fastspeech2!")
+ elif am_name == 'diffsinger':
+ fields = ["utt_id", "text", "note", "note_dur", "is_slur"]
elif am_name == 'speedyspeech':
fields = ["utt_id", "phones", "tones"]
elif am_name == 'tacotron2':
@@ -260,6 +287,7 @@ def get_dev_dataloader(dev_metadata: List[Dict[str, Any]],
def get_frontend(lang: str='zh',
phones_dict: Optional[os.PathLike]=None,
tones_dict: Optional[os.PathLike]=None,
+ pinyin_phone: Optional[os.PathLike]=None,
use_rhy=False):
if lang == 'zh':
frontend = Frontend(
@@ -273,23 +301,29 @@ def get_frontend(lang: str='zh',
elif lang == 'mix':
frontend = MixFrontend(
phone_vocab_path=phones_dict, tone_vocab_path=tones_dict)
+ elif lang == 'sing':
+ frontend = SingFrontend(
+ pinyin_phone_path=pinyin_phone, phone_vocab_path=phones_dict)
else:
print("wrong lang!")
return frontend
-def run_frontend(frontend: object,
- text: str,
- merge_sentences: bool=False,
- get_tone_ids: bool=False,
- lang: str='zh',
- to_tensor: bool=True,
- add_blank: bool=False):
+def run_frontend(
+ frontend: object,
+ text: str,
+ merge_sentences: bool=False,
+ get_tone_ids: bool=False,
+ lang: str='zh',
+ to_tensor: bool=True,
+ add_blank: bool=False,
+ svs_input: Dict[str, str]=None, ):
outs = dict()
if lang == 'zh':
input_ids = {}
if text.strip() != "" and re.match(r".*?.*? .*", text,
re.DOTALL):
+ # using ssml
input_ids = frontend.get_input_ids_ssml(
text,
merge_sentences=merge_sentences,
@@ -318,21 +352,34 @@ def run_frontend(frontend: object,
input_ids = frontend.get_input_ids(
text, merge_sentences=merge_sentences, to_tensor=to_tensor)
phone_ids = input_ids["phone_ids"]
+ elif lang == 'sing':
+ input_ids = frontend.get_input_ids(
+ svs_input=svs_input, to_tensor=to_tensor)
+ phone_ids = input_ids["phone_ids"]
+ note_ids = input_ids["note_ids"]
+ note_durs = input_ids["note_durs"]
+ is_slurs = input_ids["is_slurs"]
+ outs.update({'note_ids': note_ids})
+ outs.update({'note_durs': note_durs})
+ outs.update({'is_slurs': is_slurs})
else:
- print("lang should in {'zh', 'en', 'mix', 'canton'}!")
+ print("lang should in {'zh', 'en', 'mix', 'canton', 'sing'}!")
+
outs.update({'phone_ids': phone_ids})
return outs
# dygraph
-def get_am_inference(am: str='fastspeech2_csmsc',
- am_config: CfgNode=None,
- am_ckpt: Optional[os.PathLike]=None,
- am_stat: Optional[os.PathLike]=None,
- phones_dict: Optional[os.PathLike]=None,
- tones_dict: Optional[os.PathLike]=None,
- speaker_dict: Optional[os.PathLike]=None,
- return_am: bool=False):
+def get_am_inference(
+ am: str='fastspeech2_csmsc',
+ am_config: CfgNode=None,
+ am_ckpt: Optional[os.PathLike]=None,
+ am_stat: Optional[os.PathLike]=None,
+ phones_dict: Optional[os.PathLike]=None,
+ tones_dict: Optional[os.PathLike]=None,
+ speaker_dict: Optional[os.PathLike]=None,
+ return_am: bool=False,
+ speech_stretchs: Optional[os.PathLike]=None, ):
with open(phones_dict, 'rt', encoding='utf-8') as f:
phn_id = [line.strip().split() for line in f.readlines()]
vocab_size = len(phn_id)
@@ -355,6 +402,19 @@ def get_am_inference(am: str='fastspeech2_csmsc',
if am_name == 'fastspeech2':
am = am_class(
idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"])
+ elif am_name == 'diffsinger':
+        spec_min = np.load(speech_stretchs)[0]
+        spec_max = np.load(speech_stretchs)[1]
+        spec_min = paddle.to_tensor(spec_min)
+        spec_max = paddle.to_tensor(spec_max)
+ am_config["model"]["fastspeech2_params"]["spk_num"] = spk_num
+ am = am_class(
+ spec_min=spec_min,
+ spec_max=spec_max,
+ idim=vocab_size,
+ odim=odim,
+ **am_config["model"], )
elif am_name == 'speedyspeech':
am = am_class(
vocab_size=vocab_size,
@@ -365,8 +425,6 @@ def get_am_inference(am: str='fastspeech2_csmsc',
am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
elif am_name == 'erniesat':
am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
- else:
- print("wrong am, please input right am!!!")
am.set_state_dict(paddle.load(am_ckpt)["main_params"])
am.eval()
@@ -453,7 +511,8 @@ def am_to_static(am_inference,
elif am_name == 'tacotron2':
am_inference = jit.to_static(
am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)])
- elif am_name == 'vits':
+
+ elif am_name == 'vits' or am_name == 'jets':
if am_dataset in {"aishell3", "vctk"} and speaker_dict is not None:
am_inference = jit.to_static(
am_inference,
@@ -464,8 +523,20 @@ def am_to_static(am_inference,
else:
am_inference = jit.to_static(
am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)])
+
+ elif am_name == 'diffsinger':
+ am_inference = jit.to_static(
+ am_inference,
+ input_spec=[
+ InputSpec([-1], dtype=paddle.int64), # phone
+ InputSpec([-1], dtype=paddle.int64), # note
+ InputSpec([-1], dtype=paddle.float32), # note_dur
+ InputSpec([-1], dtype=paddle.int64), # is_slur
+ ])
+
jit.save(am_inference, os.path.join(inference_dir, am))
am_inference = jit.load(os.path.join(inference_dir, am))
+
return am_inference
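
For the new `sing` branch of `run_frontend` above, a minimal usage sketch (the dictionary paths are placeholders and the lyric/notes values are illustrative only; the dict keys follow `SingFrontend.get_input_ids` later in this patch):

```python
from paddlespeech.t2s.exps.syn_utils import get_frontend, run_frontend

# placeholder paths; real runs use the Opencpop dump/ files
frontend = get_frontend(
    lang='sing',
    phones_dict='dump/phone_id_map.txt',
    pinyin_phone='local/pinyin_to_phone.txt')

svs_input = {
    'input_type': 'word',
    'text': '小酒窝长睫毛',
    'notes': 'C#4/Db4 | F#4/Gb4 | G#4/Ab4 F#4/Gb4 | F#4/Gb4 | C#4/Db4 | C#4/Db4',
    'note_durs': '0.4 | 0.4 | 0.4 0.2 | 0.4 | 0.4 | 0.4',
}
outs = run_frontend(frontend=frontend, text="", lang='sing', svs_input=svs_input)
# outs now carries phone_ids, note_ids, note_durs and is_slurs for the DiffSinger AM.
```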
diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py
index 70e52244..e7cf7850 100644
--- a/paddlespeech/t2s/exps/synthesize.py
+++ b/paddlespeech/t2s/exps/synthesize.py
@@ -60,7 +60,8 @@ def evaluate(args):
am_stat=args.am_stat,
phones_dict=args.phones_dict,
tones_dict=args.tones_dict,
- speaker_dict=args.speaker_dict)
+ speaker_dict=args.speaker_dict,
+ speech_stretchs=args.speech_stretchs, )
test_dataset = get_test_dataset(
test_metadata=test_metadata,
am=args.am,
@@ -107,6 +108,20 @@ def evaluate(args):
if args.voice_cloning and "spk_emb" in datum:
spk_emb = paddle.to_tensor(np.load(datum["spk_emb"]))
mel = am_inference(phone_ids, spk_emb=spk_emb)
+ elif am_name == 'diffsinger':
+ phone_ids = paddle.to_tensor(datum["text"])
+ note = paddle.to_tensor(datum["note"])
+ note_dur = paddle.to_tensor(datum["note_dur"])
+ is_slur = paddle.to_tensor(datum["is_slur"])
+                # get_mel_fs2=False: mel from the diffusion decoder; get_mel_fs2=True: mel from the FastSpeech2 branch.
+ get_mel_fs2 = False
+ # mel: [T, mel_bin]
+ mel = am_inference(
+ phone_ids,
+ note=note,
+ note_dur=note_dur,
+ is_slur=is_slur,
+ get_mel_fs2=get_mel_fs2)
# vocoder
wav = voc_inference(mel)
@@ -134,10 +149,17 @@ def parse_args():
type=str,
default='fastspeech2_csmsc',
choices=[
- 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech',
- 'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc',
- 'tacotron2_ljspeech', 'tacotron2_aishell3', 'fastspeech2_mix',
- 'fastspeech2_canton'
+ 'speedyspeech_csmsc',
+ 'fastspeech2_csmsc',
+ 'fastspeech2_ljspeech',
+ 'fastspeech2_aishell3',
+ 'fastspeech2_vctk',
+ 'tacotron2_csmsc',
+ 'tacotron2_ljspeech',
+ 'tacotron2_aishell3',
+ 'fastspeech2_mix',
+ 'fastspeech2_canton',
+ 'diffsinger_opencpop',
],
help='Choose acoustic model type of tts task.')
parser.add_argument(
@@ -170,10 +192,19 @@ def parse_args():
type=str,
default='pwgan_csmsc',
choices=[
- 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk',
- 'mb_melgan_csmsc', 'wavernn_csmsc', 'hifigan_csmsc',
- 'hifigan_ljspeech', 'hifigan_aishell3', 'hifigan_vctk',
- 'style_melgan_csmsc'
+ 'pwgan_csmsc',
+ 'pwgan_ljspeech',
+ 'pwgan_aishell3',
+ 'pwgan_vctk',
+ 'mb_melgan_csmsc',
+ 'wavernn_csmsc',
+ 'hifigan_csmsc',
+ 'hifigan_ljspeech',
+ 'hifigan_aishell3',
+ 'hifigan_vctk',
+ 'style_melgan_csmsc',
+ "pwgan_opencpop",
+ "hifigan_opencpop",
],
help='Choose vocoder type of tts task.')
parser.add_argument(
@@ -188,9 +219,20 @@ def parse_args():
)
# other
parser.add_argument(
- "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+ "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.")
+ parser.add_argument(
+ "--nxpu",
+ type=int,
+ default=0,
+ help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu."
+ )
parser.add_argument("--test_metadata", type=str, help="test metadata.")
parser.add_argument("--output_dir", type=str, help="output dir.")
+ parser.add_argument(
+ "--speech_stretchs",
+ type=str,
+ default=None,
+ help="The min and max values of the mel spectrum.")
args = parser.parse_args()
return args
@@ -199,12 +241,14 @@ def parse_args():
def main():
args = parse_args()
- if args.ngpu == 0:
- paddle.set_device("cpu")
- elif args.ngpu > 0:
+ if args.ngpu > 0:
paddle.set_device("gpu")
+ elif args.nxpu > 0:
+ paddle.set_device("xpu")
+ elif args.ngpu == 0 and args.nxpu == 0:
+ paddle.set_device("cpu")
else:
- print("ngpu should >= 0 !")
+ print("ngpu or nxpu should >= 0 !")
evaluate(args)
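
The device selection above now prefers GPU, then XPU, then CPU. A condensed, standalone sketch of the equivalent logic (argument names mirror the script):

```python
import paddle

def select_device(ngpu: int = 1, nxpu: int = 0) -> str:
    # GPU wins if requested, then XPU; both zero means CPU; negatives are invalid.
    if ngpu > 0:
        device = "gpu"
    elif nxpu > 0:
        device = "xpu"
    elif ngpu == 0 and nxpu == 0:
        device = "cpu"
    else:
        raise ValueError("ngpu and nxpu should be >= 0!")
    paddle.set_device(device)
    return device
```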
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index db94a6e5..c63a5fbe 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -13,6 +13,7 @@
# limitations under the License.
import argparse
from pathlib import Path
+from pprint import pprint
import paddle
import soundfile as sf
@@ -24,6 +25,7 @@ from paddlespeech.t2s.exps.syn_utils import am_to_static
from paddlespeech.t2s.exps.syn_utils import get_am_inference
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_sentences
+from paddlespeech.t2s.exps.syn_utils import get_sentences_svs
from paddlespeech.t2s.exps.syn_utils import get_voc_inference
from paddlespeech.t2s.exps.syn_utils import run_frontend
from paddlespeech.t2s.exps.syn_utils import voc_to_static
@@ -44,20 +46,18 @@ def evaluate(args):
print(am_config)
print(voc_config)
- sentences = get_sentences(text_file=args.text, lang=args.lang)
-
# frontend
frontend = get_frontend(
lang=args.lang,
phones_dict=args.phones_dict,
tones_dict=args.tones_dict,
+ pinyin_phone=args.pinyin_phone,
use_rhy=args.use_rhy)
print("frontend done!")
# acoustic model
am_name = args.am[:args.am.rindex('_')]
am_dataset = args.am[args.am.rindex('_') + 1:]
-
am_inference = get_am_inference(
am=args.am,
am_config=am_config,
@@ -65,8 +65,10 @@ def evaluate(args):
am_stat=args.am_stat,
phones_dict=args.phones_dict,
tones_dict=args.tones_dict,
- speaker_dict=args.speaker_dict)
+ speaker_dict=args.speaker_dict,
+ speech_stretchs=args.speech_stretchs, )
print("acoustic model done!")
+
# vocoder
voc_inference = get_voc_inference(
voc=args.voc,
@@ -77,6 +79,7 @@ def evaluate(args):
# whether dygraph to static
if args.inference_dir:
+ print("convert am and voc to static model.")
# acoustic model
am_inference = am_to_static(
am_inference=am_inference,
@@ -91,6 +94,7 @@ def evaluate(args):
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
+
merge_sentences = False
# Avoid not stopping at the end of a sub sentence when tacotron2_ljspeech dygraph to static graph
# but still not stopping in the end (NOTE by yuantian01 Feb 9 2022)
@@ -101,39 +105,76 @@ def evaluate(args):
if am_name == 'speedyspeech':
get_tone_ids = True
+ # wav samples
N = 0
+ # inference time cost
T = 0
+
+ # [(uid, text), ]
+ if am_name == 'diffsinger':
+ sentences = get_sentences_svs(text_file=args.text)
+ else:
+ sentences = get_sentences(text_file=args.text, lang=args.lang)
+
for utt_id, sentence in sentences:
+ print(f"{utt_id} {sentence}")
with timer() as t:
+ if am_name == "diffsinger":
+ text = ""
+ svs_input = sentence
+ else:
+ text = sentence
+ svs_input = None
+
+ # frontend
frontend_dict = run_frontend(
frontend=frontend,
- text=sentence,
+ text=text,
merge_sentences=merge_sentences,
get_tone_ids=get_tone_ids,
- lang=args.lang)
+ lang=args.lang,
+ svs_input=svs_input)
phone_ids = frontend_dict['phone_ids']
+ # pprint(f"{utt_id} {phone_ids}")
+
with paddle.no_grad():
flags = 0
for i in range(len(phone_ids)):
+ # sub phone, split by `sp` or punctuation.
part_phone_ids = phone_ids[i]
+
# acoustic model
if am_name == 'fastspeech2':
# multi speaker
if am_dataset in {"aishell3", "vctk", "mix", "canton"}:
- spk_id = paddle.to_tensor(args.spk_id)
+ # multi-speaker
+ spk_id = paddle.to_tensor([args.spk_id])
mel = am_inference(part_phone_ids, spk_id)
else:
+ # single-speaker
mel = am_inference(part_phone_ids)
elif am_name == 'speedyspeech':
part_tone_ids = frontend_dict['tone_ids'][i]
if am_dataset in {"aishell3", "vctk", "mix"}:
- spk_id = paddle.to_tensor(args.spk_id)
+ # multi-speaker
+ spk_id = paddle.to_tensor([args.spk_id])
mel = am_inference(part_phone_ids, part_tone_ids,
spk_id)
else:
+ # single-speaker
mel = am_inference(part_phone_ids, part_tone_ids)
elif am_name == 'tacotron2':
mel = am_inference(part_phone_ids)
+ elif am_name == 'diffsinger':
+ part_note_ids = frontend_dict['note_ids'][i]
+ part_note_durs = frontend_dict['note_durs'][i]
+ part_is_slurs = frontend_dict['is_slurs'][i]
+ mel = am_inference(
+ text=part_phone_ids,
+ note=part_note_ids,
+ note_dur=part_note_durs,
+ is_slur=part_is_slurs, )
+
# vocoder
wav = voc_inference(mel)
if flags == 0:
@@ -141,17 +182,23 @@ def evaluate(args):
flags = 1
else:
wav_all = paddle.concat([wav_all, wav])
+
wav = wav_all.numpy()
N += wav.size
T += t.elapse
+
+ # samples per second
speed = wav.size / t.elapse
+ # generate one second wav need `RTF` seconds
rtf = am_config.fs / speed
print(
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
)
+
sf.write(
str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
print(f"{utt_id} done!")
+
print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
@@ -178,6 +225,7 @@ def parse_args():
'fastspeech2_male-zh',
'fastspeech2_male-en',
'fastspeech2_male-mix',
+ 'diffsinger_opencpop',
],
help='Choose acoustic model type of tts task.')
parser.add_argument(
@@ -223,6 +271,8 @@ def parse_args():
'wavernn_csmsc',
'pwgan_male',
'hifigan_male',
+ 'pwgan_opencpop',
+ 'hifigan_opencpop',
],
help='Choose vocoder type of tts task.')
parser.add_argument(
@@ -240,6 +290,7 @@ def parse_args():
'--lang',
type=str,
default='zh',
+ choices=['zh', 'en', 'mix', 'canton', 'sing'],
help='Choose model language. zh or en or mix')
parser.add_argument(
@@ -248,7 +299,13 @@ def parse_args():
default=None,
help="dir to save inference models")
parser.add_argument(
- "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+ "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu or xpu.")
+ parser.add_argument(
+ "--nxpu",
+ type=int,
+ default=0,
+ help="if wish to use xpu, set ngpu == 0 and nxpu > 0, and if ngpu == 0 and nxpu == 0, use cpu."
+ )
parser.add_argument(
"--text",
type=str,
@@ -259,6 +316,17 @@ def parse_args():
type=str2bool,
default=False,
help="run rhythm frontend or not")
+ parser.add_argument(
+ "--pinyin_phone",
+ type=str,
+ default=None,
+ help="pinyin to phone map file, using on sing_frontend.")
+ parser.add_argument(
+ "--speech_stretchs",
+ type=str,
+ default=None,
+ help="The min and max values of the mel spectrum, using on diffusion of diffsinger."
+ )
args = parser.parse_args()
return args
@@ -267,12 +335,14 @@ def parse_args():
def main():
args = parse_args()
- if args.ngpu == 0:
- paddle.set_device("cpu")
- elif args.ngpu > 0:
+ if args.ngpu > 0:
paddle.set_device("gpu")
+ elif args.nxpu > 0:
+ paddle.set_device("xpu")
+ elif args.ngpu == 0 and args.nxpu == 0:
+ paddle.set_device("cpu")
else:
- print("ngpu should >= 0 !")
+ print("ngpu or nxpu should >= 0 !")
evaluate(args)
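
The timing comments added above define RTF as `am_config.fs / (samples generated per second)`, i.e. seconds of compute per second of audio. A small self-contained check of that arithmetic (the numbers are made up):

```python
fs = 24000           # output sample rate, e.g. 24 kHz
elapse = 0.5         # wall-clock seconds spent on one utterance
num_samples = 48000  # samples produced, i.e. 2 s of audio

speed = num_samples / elapse  # samples generated per second -> 96000.0
rtf = fs / speed              # compute time per second of audio -> 0.25
print(f"Hz: {speed}, RTF: {rtf}")
```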
diff --git a/paddlespeech/t2s/exps/tacotron2/preprocess.py b/paddlespeech/t2s/exps/tacotron2/preprocess.py
index c27b9769..46b72591 100644
--- a/paddlespeech/t2s/exps/tacotron2/preprocess.py
+++ b/paddlespeech/t2s/exps/tacotron2/preprocess.py
@@ -311,6 +311,7 @@ def main():
sentences=sentences,
output_dir=dev_dump_dir,
mel_extractor=mel_extractor,
+ nprocs=args.num_cpu,
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if test_wav_files:
diff --git a/paddlespeech/t2s/exps/transformer_tts/preprocess.py b/paddlespeech/t2s/exps/transformer_tts/preprocess.py
index 2ebd5ecc..4e82e53f 100644
--- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py
+++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py
@@ -27,7 +27,7 @@ import yaml
from yacs.config import CfgNode as Configuration
from paddlespeech.t2s.datasets.get_feats import LogMelFBank
-from paddlespeech.t2s.frontend import English
+from paddlespeech.t2s.frontend.en_frontend import English
def get_lj_sentences(file_name, frontend):
diff --git a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py
index 0cd7d224..279407b3 100644
--- a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py
@@ -21,7 +21,7 @@ import soundfile as sf
import yaml
from yacs.config import CfgNode
-from paddlespeech.t2s.frontend import English
+from paddlespeech.t2s.frontend.en_frontend import English
from paddlespeech.t2s.models.transformer_tts import TransformerTTS
from paddlespeech.t2s.models.transformer_tts import TransformerTTSInference
from paddlespeech.t2s.models.waveflow import ConditionalWaveFlow
diff --git a/paddlespeech/t2s/exps/vits/lite_predict.py b/paddlespeech/t2s/exps/vits/lite_predict.py
index 790cd48e..32a544b7 100644
--- a/paddlespeech/t2s/exps/vits/lite_predict.py
+++ b/paddlespeech/t2s/exps/vits/lite_predict.py
@@ -21,6 +21,7 @@ from paddlespeech.t2s.exps.lite_syn_utils import get_lite_am_output
from paddlespeech.t2s.exps.lite_syn_utils import get_lite_predictor
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_sentences
+from paddlespeech.t2s.utils import str2bool
def parse_args():
@@ -75,12 +76,12 @@ def main():
# frontend
frontend = get_frontend(
lang=args.lang,
- phones_dict=args.phones_dict,
- tones_dict=args.tones_dict)
+ phones_dict=args.phones_dict)
# am_predictor
+    # the vits lite model can only run on ARM
am_predictor = get_lite_predictor(
- model_dir=args.inference_dir, model_file=args.am + "_x86.nb")
+ model_dir=args.inference_dir, model_file=args.am + "_arm.nb")
# model: {model_name}_{dataset}
am_dataset = args.am[args.am.rindex('_') + 1:]
diff --git a/paddlespeech/t2s/exps/vits/preprocess.py b/paddlespeech/t2s/exps/vits/preprocess.py
index d6b226a2..23c959d4 100644
--- a/paddlespeech/t2s/exps/vits/preprocess.py
+++ b/paddlespeech/t2s/exps/vits/preprocess.py
@@ -321,6 +321,7 @@ def main():
sentences=sentences,
output_dir=dev_dump_dir,
spec_extractor=spec_extractor,
+ nprocs=args.num_cpu,
cut_sil=args.cut_sil,
spk_emb_dir=spk_emb_dir)
if test_wav_files:
diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py
index 0e74bf63..cdfd3003 100644
--- a/paddlespeech/t2s/exps/vits/train.py
+++ b/paddlespeech/t2s/exps/vits/train.py
@@ -230,17 +230,15 @@ def train_sp(args, config):
output_dir=output_dir)
trainer = Trainer(
- updater,
- stop_trigger=(config.train_max_steps, "iteration"),
- out=output_dir)
+ updater, stop_trigger=(config.max_epoch, 'epoch'), out=output_dir)
if dist.get_rank() == 0:
trainer.extend(
- evaluator, trigger=(config.eval_interval_steps, 'iteration'))
+ evaluator, trigger=(config.eval_interval_epochs, 'epoch'))
trainer.extend(VisualDL(output_dir), trigger=(1, 'iteration'))
trainer.extend(
Snapshot(max_size=config.num_snapshots),
- trigger=(config.save_interval_steps, 'iteration'))
+ trigger=(config.save_interval_epochs, 'epoch'))
print("Trainer Done!")
trainer.run()
diff --git a/paddlespeech/t2s/frontend/__init__.py b/paddlespeech/t2s/frontend/__init__.py
index 64015435..a8f77d55 100644
--- a/paddlespeech/t2s/frontend/__init__.py
+++ b/paddlespeech/t2s/frontend/__init__.py
@@ -13,8 +13,8 @@
# limitations under the License.
from .generate_lexicon import *
from .normalizer import *
-from .phonectic import *
from .punctuation import *
+from .ssml import *
from .tone_sandhi import *
from .vocab import *
from .zh_normalization import *
diff --git a/paddlespeech/t2s/frontend/arpabet.py b/paddlespeech/t2s/frontend/arpabet.py
index 7a81b645..9b2b11b3 100644
--- a/paddlespeech/t2s/frontend/arpabet.py
+++ b/paddlespeech/t2s/frontend/arpabet.py
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-from paddlespeech.t2s.frontend.phonectic import Phonetics
"""
A phonology system with ARPABET symbols and limited punctuations. The G2P
conversion is done by g2p_en.
@@ -19,55 +18,68 @@ conversion is done by g2p_en.
Note that g2p_en does not handle words with hyphens well. So make sure the input
sentence is first normalized.
"""
-from paddlespeech.t2s.frontend.vocab import Vocab
from g2p_en import G2p
+from paddlespeech.t2s.frontend.phonectic import Phonetics
+from paddlespeech.t2s.frontend.vocab import Vocab
+
class ARPABET(Phonetics):
- """A phonology for English that uses ARPABET as the phoneme vocabulary.
+ """A phonology for English that uses ARPABET without stress as the phoneme vocabulary.
+
+    47 symbols = 39 phones + 4 punctuations + 4 special tokens (<pad> <unk> <s> </s>)
+
+    The phoneme set contains 39 phonemes; vowels carry a lexical stress marker:
+ 0 — No stress
+ 1 — Primary stress
+ 2 — Secondary stress
+
+ Phoneme Set:
+ Phoneme Example Translation
+ ------- ------- -----------
+ AA odd AA D
+ AE at AE T
+ AH hut HH AH T
+ AO ought AO T
+ AW cow K AW
+ AY hide HH AY D
+ B be B IY
+ CH cheese CH IY Z
+ D dee D IY
+ DH thee DH IY
+ EH Ed EH D
+ ER hurt HH ER T
+ EY ate EY T
+ F fee F IY
+ G green G R IY N
+ HH he HH IY
+ IH it IH T
+ IY eat IY T
+ JH gee JH IY
+ K key K IY
+ L lee L IY
+ M me M IY
+ N knee N IY
+ NG ping P IH NG
+ OW oat OW T
+ OY toy T OY
+ P pee P IY
+ R read R IY D
+ S sea S IY
+ SH she SH IY
+ T tea T IY
+ TH theta TH EY T AH
+ UH hood HH UH D
+ UW two T UW
+ V vee V IY
+ W we W IY
+ Y yield Y IY L D
+ Z zee Z IY
+ ZH seizure S IY ZH ER
+
See http://www.speech.cs.cmu.edu/cgi-bin/cmudict for more details.
- Phoneme Example Translation
- ------- ------- -----------
- AA odd AA D
- AE at AE T
- AH hut HH AH T
- AO ought AO T
- AW cow K AW
- AY hide HH AY D
- B be B IY
- CH cheese CH IY Z
- D dee D IY
- DH thee DH IY
- EH Ed EH D
- ER hurt HH ER T
- EY ate EY T
- F fee F IY
- G green G R IY N
- HH he HH IY
- IH it IH T
- IY eat IY T
- JH gee JH IY
- K key K IY
- L lee L IY
- M me M IY
- N knee N IY
- NG ping P IH NG
- OW oat OW T
- OY toy T OY
- P pee P IY
- R read R IY D
- S sea S IY
- SH she SH IY
- T tea T IY
- TH theta TH EY T AH
- UH hood HH UH D
- UW two T UW
- V vee V IY
- W we W IY
- Y yield Y IY L D
- Z zee Z IY
- ZH seizure S IY ZH ER
"""
+ # 39 phonemes
phonemes = [
'AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D', 'DH', 'EH', 'ER',
'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW',
@@ -76,6 +88,8 @@ class ARPABET(Phonetics):
]
punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations
+ # vowels carry a lexical stress marker:
+    # 0 unstressed, 1 primary stress, 2 secondary stress
_stress_to_no_stress_ = {
'AA0': 'AA',
'AA1': 'AA',
@@ -124,7 +138,12 @@ class ARPABET(Phonetics):
'UW2': 'UW'
}
+ def __repr__(self):
+ fmt = "ARPABETWithoutStress(phonemes: {}, punctuations: {})"
+        return fmt.format(len(self.phonemes), self.punctuations)
+
def __init__(self):
+ # https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py
self.backend = G2p()
self.vocab = Vocab(self.phonemes + self.punctuations)
@@ -139,6 +158,7 @@ class ARPABET(Phonetics):
Returns:
List[str]: The list of pronunciation sequence.
"""
+ # g2p and remove vowel stress
phonemes = [
self._remove_vowels(item) for item in self.backend(sentence)
]
@@ -158,6 +178,7 @@ class ARPABET(Phonetics):
Returns:
List[int]: The list of pronunciation id sequence.
"""
+ # phonemes to ids
ids = [self.vocab.lookup(item) for item in phonemes]
return ids
@@ -189,11 +210,16 @@ class ARPABET(Phonetics):
def vocab_size(self):
""" Vocab size.
"""
- # 47 = 39 phones + 4 punctuations + 4 special tokens
+        # 47 = 39 phones + 4 punctuations + 4 special tokens (<pad> <unk> <s> </s>)
return len(self.vocab)
class ARPABETWithStress(Phonetics):
+ """
+ A phonology for English that uses ARPABET with stress as the phoneme vocabulary.
+
+ 77 symbols = 69 phones + 4 punctuations + 4 special tokens
+ """
phonemes = [
'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D',
@@ -206,6 +232,10 @@ class ARPABETWithStress(Phonetics):
punctuations = [',', '.', '?', '!']
symbols = phonemes + punctuations
+ def __repr__(self):
+ fmt = "ARPABETWithStress(phonemes: {}, punctuations: {})"
+        return fmt.format(len(self.phonemes), self.punctuations)
+
def __init__(self):
self.backend = G2p()
self.vocab = Vocab(self.phonemes + self.punctuations)
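
As a rough illustration of the two phonologies documented above (the output depends on `g2p_en`, so the phone sequences shown are indicative, not guaranteed):

```python
from paddlespeech.t2s.frontend.arpabet import ARPABET, ARPABETWithStress

with_stress = ARPABETWithStress()
without_stress = ARPABET()

# g2p_en typically yields "HH AY1 D" for "hide"; ARPABET strips the stress digit.
print(with_stress.phoneticize("hide"))     # e.g. ['HH', 'AY1', 'D']
print(without_stress.phoneticize("hide"))  # e.g. ['HH', 'AY', 'D']
```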
diff --git a/paddlespeech/t2s/frontend/canton_frontend.py b/paddlespeech/t2s/frontend/canton_frontend.py
index f2c7175f..bbb7bcf0 100644
--- a/paddlespeech/t2s/frontend/canton_frontend.py
+++ b/paddlespeech/t2s/frontend/canton_frontend.py
@@ -29,7 +29,8 @@ INITIALS = [
INITIALS += ['sp', 'spl', 'spn', 'sil']
-def get_lines(cantons: List[str]):
+def jyuping_to_phonemes(cantons: List[str]):
+    # split jyutping syllables into initials and finals
phones = []
for canton in cantons:
for consonant in INITIALS:
@@ -47,7 +48,7 @@ def get_lines(cantons: List[str]):
class CantonFrontend():
def __init__(self, phone_vocab_path: str):
self.text_normalizer = TextNormalizer()
- self.punc = ":,;。?!“”‘’':,;.?!"
+ self.punc = "、:,;。?!“”‘’':,;.?!"
self.vocab_phones = {}
if phone_vocab_path:
@@ -61,8 +62,11 @@ class CantonFrontend():
merge_sentences: bool=True) -> List[List[str]]:
phones_list = []
for sentence in sentences:
+            # jyutping text, e.g.:
+ # 'gam3 ngaam1 lou5 sai3 jiu1 kau4 keoi5 dang2 zan6 jiu3 hoi1 wui2, zing6 dai1 ge2 je5 ngo5 wui5 gaau2 dim6 ga3 laa3.'
phones_str = ToJyutping.get_jyutping_text(sentence)
- phones_split = get_lines(phones_str.split(' '))
+ # phonemes
+ phones_split = jyuping_to_phonemes(phones_str.split(' '))
phones_list.append(phones_split)
return phones_list
@@ -78,8 +82,11 @@ class CantonFrontend():
sentence: str,
merge_sentences: bool=True,
print_info: bool=False) -> List[List[str]]:
+ # TN & Text Segmentation
sentences = self.text_normalizer.normalize(sentence)
+ # G2P
phonemes = self._g2p(sentences, merge_sentences=merge_sentences)
+
if print_info:
print("----------------------------")
print("text norm results:")
@@ -88,6 +95,7 @@ class CantonFrontend():
print("g2p results:")
print(phonemes)
print("----------------------------")
+
return phonemes
def get_input_ids(self,
@@ -98,9 +106,9 @@ class CantonFrontend():
phonemes = self.get_phonemes(
sentence, merge_sentences=merge_sentences, print_info=print_info)
+
result = {}
temp_phone_ids = []
-
for phones in phonemes:
if phones:
phone_ids = self._p2id(phones)
@@ -108,6 +116,8 @@ class CantonFrontend():
if to_tensor:
phone_ids = paddle.to_tensor(phone_ids)
temp_phone_ids.append(phone_ids)
+
if temp_phone_ids:
result["phone_ids"] = temp_phone_ids
+
return result
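
A hedged sketch of the renamed `jyuping_to_phonemes` helper: it splits each jyutping syllable (as produced by `ToJyutping` in `_g2p`) into an initial from `INITIALS` plus the remaining final; the result shown below is an assumption based on that description, not verified output.

```python
from paddlespeech.t2s.frontend.canton_frontend import jyuping_to_phonemes

# syllables taken from the example comment in _g2p above
phones = jyuping_to_phonemes(['gam3', 'ngaam1'])
# assumed result: initial/final pairs flattened into one list,
# e.g. ['g', 'am3', 'ng', 'aam1']
print(phones)
```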
diff --git a/paddlespeech/t2s/frontend/en_frontend.py b/paddlespeech/t2s/frontend/en_frontend.py
new file mode 100644
index 00000000..c58bed7d
--- /dev/null
+++ b/paddlespeech/t2s/frontend/en_frontend.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .phonectic import English
diff --git a/paddlespeech/t2s/frontend/generate_lexicon.py b/paddlespeech/t2s/frontend/generate_lexicon.py
index 6b467d00..4fb748a6 100644
--- a/paddlespeech/t2s/frontend/generate_lexicon.py
+++ b/paddlespeech/t2s/frontend/generate_lexicon.py
@@ -45,7 +45,7 @@ def rule(C, V, R, T):
'u' in syllables when certain conditions are satisfied.
'i' is distinguished when appeared in phonemes, and separated into 3 categories, 'i', 'ii' and 'iii'.
- Erhua is is possibly applied to every finals, except for finals that already ends with 'r'.
+ Erhua is possibly applied to every finals, except for finals that already ends with 'r'.
When a syllable is impossible or does not have any characters with this pronunciation, return None
to filter it out.
"""
diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py
index b8c16097..2ebfe135 100644
--- a/paddlespeech/t2s/frontend/mix_frontend.py
+++ b/paddlespeech/t2s/frontend/mix_frontend.py
@@ -18,9 +18,9 @@ from typing import List
import numpy as np
import paddle
-from paddlespeech.t2s.frontend import English
-from paddlespeech.t2s.frontend.zh_frontend import Frontend
-from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor
+from paddlespeech.t2s.frontend.en_frontend import English as EnFrontend
+from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
+from paddlespeech.t2s.frontend.zh_frontend import Frontend as ZhFrontend
class MixFrontend():
@@ -28,10 +28,9 @@ class MixFrontend():
g2p_model="pypinyin",
phone_vocab_path=None,
tone_vocab_path=None):
-
- self.zh_frontend = Frontend(
+ self.zh_frontend = ZhFrontend(
phone_vocab_path=phone_vocab_path, tone_vocab_path=tone_vocab_path)
- self.en_frontend = English(phone_vocab_path=phone_vocab_path)
+ self.en_frontend = EnFrontend(phone_vocab_path=phone_vocab_path)
self.sp_id = self.zh_frontend.vocab_phones["sp"]
self.sp_id_numpy = np.array([self.sp_id])
self.sp_id_tensor = paddle.to_tensor([self.sp_id])
@@ -55,15 +54,12 @@ class MixFrontend():
else:
return False
- def get_segment(self, text: str) -> List[str]:
+ def split_by_lang(self, text: str) -> List[str]:
# sentence --> [ch_part, en_part, ch_part, ...]
segments = []
types = []
- flag = 0
- temp_seg = ""
- temp_lang = ""
- # Determine the type of each character. type: blank, chinese, alphabet, number, unk and point.
+ # Determine the type of each character. type: chinese, alphabet, other.
for ch in text:
if self.is_chinese(ch):
types.append("zh")
@@ -74,31 +70,31 @@ class MixFrontend():
assert len(types) == len(text)
- for i in range(len(types)):
+ flag = 0
+ temp_seg = ""
+ temp_lang = ""
+
+ for i in range(len(text)):
# find the first char of the seg
if flag == 0:
temp_seg += text[i]
temp_lang = types[i]
flag = 1
-
else:
if temp_lang == "other":
- if types[i] == temp_lang:
- temp_seg += text[i]
- else:
- temp_seg += text[i]
+                    # segment so far is only 'other' chars: absorb this char and take its language
+ temp_seg += text[i]
+ if types[i] != temp_lang:
temp_lang = types[i]
-
else:
- if types[i] == temp_lang:
- temp_seg += text[i]
- elif types[i] == "other":
+ if types[i] == temp_lang or types[i] == "other":
+ # merge same lang or other
temp_seg += text[i]
else:
+ # change lang
segments.append((temp_seg, temp_lang))
temp_seg = text[i]
- temp_lang = types[i]
- flag = 1
+ temp_lang = types[i] # new lang
segments.append((temp_seg, temp_lang))
@@ -110,76 +106,95 @@ class MixFrontend():
get_tone_ids: bool=False,
add_sp: bool=True,
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
- ''' 1. 添加SSML支持,先列出 文字 和 标签内容,
- 然后添加到tmpSegments数组里
- '''
- d_inputs = MixTextProcessor.get_dom_split(sentence)
- tmpSegments = []
- for instr in d_inputs:
- ''' 暂时只支持 say-as '''
- if instr.lower().startswith("" + currentSeg[0] + ""
segments.append(tuple(currentSeg))
+ # en
segments.append(seg)
+ # reset
currentSeg = ["", ""]
else:
+ # zh
if currentSeg[0] == '':
+                        # first zh segment: start accumulating
currentSeg[0] = seg[0]
currentSeg[1] = seg[1]
else:
+ # merge zh
currentSeg[0] = currentSeg[0] + seg[0]
+
if currentSeg[0] != '':
+ # last zh
currentSeg[0] = "" + currentSeg[0] + " "
segments.append(tuple(currentSeg))
phones_list = []
result = {}
+ # 008 我们要去云南 team building, 非常非常 happy.
+ # seg ('我们要去云南 ', 'zh')
+ # seg ('team building, ', 'en')
+ # seg ('非常非常 ', 'zh')
+ # seg ('happy.', 'en')
+ # [('我们要去云南 ', 'zh'), ('team building, ', 'en'), ('非常非常 ', 'zh'), ('happy.', 'en')]
for seg in segments:
content = seg[0]
lang = seg[1]
- if content != '':
- if lang == "en":
- input_ids = self.en_frontend.get_input_ids(
- content, merge_sentences=False, to_tensor=to_tensor)
+
+ if not content:
+ continue
+
+ if lang == "en":
+ input_ids = self.en_frontend.get_input_ids(
+ content, merge_sentences=False, to_tensor=to_tensor)
+ else:
+ if content.strip() != "" and \
+                        re.match(r".*?<speak>.*?</speak>.*", content, re.DOTALL):
+ # process ssml
+ input_ids = self.zh_frontend.get_input_ids_ssml(
+ content,
+ merge_sentences=False,
+ get_tone_ids=get_tone_ids,
+ to_tensor=to_tensor)
else:
- ''' 3. 把带speak tag的中文和普通文字分开处理
- '''
- if content.strip() != "" and \
-                        re.match(r".*?<speak>.*?</speak>.*", content, re.DOTALL):
- input_ids = self.zh_frontend.get_input_ids_ssml(
- content,
- merge_sentences=False,
- get_tone_ids=get_tone_ids,
- to_tensor=to_tensor)
- else:
- input_ids = self.zh_frontend.get_input_ids(
- content,
- merge_sentences=False,
- get_tone_ids=get_tone_ids,
- to_tensor=to_tensor)
- if add_sp:
- if to_tensor:
- input_ids["phone_ids"][-1] = paddle.concat(
- [input_ids["phone_ids"][-1], self.sp_id_tensor])
- else:
- input_ids["phone_ids"][-1] = np.concatenate(
- (input_ids["phone_ids"][-1], self.sp_id_numpy))
+ # process plain text
+ input_ids = self.zh_frontend.get_input_ids(
+ content,
+ merge_sentences=False,
+ get_tone_ids=get_tone_ids,
+ to_tensor=to_tensor)
+
+ if add_sp:
+ # add sp between zh and en
+ if to_tensor:
+ input_ids["phone_ids"][-1] = paddle.concat(
+ [input_ids["phone_ids"][-1], self.sp_id_tensor])
+ else:
+ input_ids["phone_ids"][-1] = np.concatenate(
+ (input_ids["phone_ids"][-1], self.sp_id_numpy))
- for phones in input_ids["phone_ids"]:
- phones_list.append(phones)
+ phones_list.extend(input_ids["phone_ids"])
if merge_sentences:
merge_list = paddle.concat(phones_list)
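
The worked example in the comments above shows what the renamed `split_by_lang` is meant to return; a minimal sketch, assuming the usual dump files exist at the placeholder paths:

```python
from paddlespeech.t2s.frontend.mix_frontend import MixFrontend

frontend = MixFrontend(
    phone_vocab_path='dump/phone_id_map.txt',  # placeholder
    tone_vocab_path='dump/tone_id_map.txt')    # placeholder

segments = frontend.split_by_lang("我们要去云南 team building, 非常非常 happy.")
# per the comment above, roughly:
# [('我们要去云南 ', 'zh'), ('team building, ', 'en'), ('非常非常 ', 'zh'), ('happy.', 'en')]
print(segments)
```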
diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py
index af86d9b8..d6c66f1e 100644
--- a/paddlespeech/t2s/frontend/phonectic.py
+++ b/paddlespeech/t2s/frontend/phonectic.py
@@ -47,15 +47,34 @@ class Phonetics(ABC):
class English(Phonetics):
""" Normalize the input text sequence and convert into pronunciation id sequence.
+
+ https://github.com/Kyubyong/g2p/blob/master/g2p_en/g2p.py
+
+ phonemes = ["", "", "", " "] + [
+ 'AA0', 'AA1', 'AA2', 'AE0', 'AE1', 'AE2', 'AH0', 'AH1', 'AH2', 'AO0',
+ 'AO1', 'AO2', 'AW0', 'AW1', 'AW2', 'AY0', 'AY1', 'AY2', 'B', 'CH', 'D', 'DH',
+ 'EH0', 'EH1', 'EH2', 'ER0', 'ER1', 'ER2', 'EY0', 'EY1',
+ 'EY2', 'F', 'G', 'HH',
+ 'IH0', 'IH1', 'IH2', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L',
+ 'M', 'N', 'NG', 'OW0', 'OW1',
+ 'OW2', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH',
+ 'UH0', 'UH1', 'UH2', 'UW',
+ 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH']
"""
+ LEXICON = {
+        # keys use lowercase
+ "AI".lower(): [["EY0", "AY1"]],
+ }
+
def __init__(self, phone_vocab_path=None):
self.backend = G2p()
+ self.backend.cmu.update(English.LEXICON)
self.phonemes = list(self.backend.phonemes)
self.punctuations = get_punctuations("en")
self.vocab = Vocab(self.phonemes + self.punctuations)
self.vocab_phones = {}
- self.punc = ":,;。?!“”‘’':,;.?!"
+ self.punc = "、:,;。?!“”‘’':,;.?!"
self.text_normalizer = TextNormalizer()
if phone_vocab_path:
with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
@@ -86,8 +105,8 @@ class English(Phonetics):
sentence: str,
merge_sentences: bool=False,
to_tensor: bool=True) -> paddle.Tensor:
- result = {}
sentences = self.text_normalizer._split(sentence, lang="en")
+
phones_list = []
temp_phone_ids = []
for sentence in sentences:
@@ -118,7 +137,10 @@ class English(Phonetics):
if to_tensor:
phone_ids = paddle.to_tensor(phone_ids)
temp_phone_ids.append(phone_ids)
+
+ result = {}
result["phone_ids"] = temp_phone_ids
+
return result
def numericalize(self, phonemes):
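
The new `LEXICON` override patches `g2p_en`'s CMU dictionary so that "AI" is read letter by letter; extra fixed pronunciations can be added the same way. A small sketch (the extra "gpu" entry is purely illustrative):

```python
from paddlespeech.t2s.frontend.phonectic import English

frontend = English()
# same cmudict format as English.LEXICON: lowercase key, list of phone lists
frontend.backend.cmu["gpu"] = [["JH", "IY1", "P", "IY1", "Y", "UW1"]]
print(frontend.phoneticize("AI on GPU"))  # "AI" now comes out as EY0 AY1
```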
diff --git a/paddlespeech/t2s/frontend/polyphonic.py b/paddlespeech/t2s/frontend/polyphonic.py
new file mode 100644
index 00000000..9a757e20
--- /dev/null
+++ b/paddlespeech/t2s/frontend/polyphonic.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+
+import yaml
+
+
+class Polyphonic():
+ def __init__(self):
+ with open(
+ os.path.join(
+ os.path.dirname(os.path.abspath(__file__)),
+ 'polyphonic.yaml'),
+ 'r',
+ encoding='utf-8') as polyphonic_file:
+            # parse the yaml file
+ polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader)
+ self.polyphonic_words = polyphonic_dict["polyphonic"]
+
+ def correct_pronunciation(self, word, pinyin):
+        # if the word is in the dictionary, return the corrected pronunciation
+ if word in self.polyphonic_words.keys():
+ pinyin = self.polyphonic_words[word]
+        # otherwise return the original pronunciation
+ return pinyin
diff --git a/paddlespeech/t2s/frontend/polyphonic.yaml b/paddlespeech/t2s/frontend/polyphonic.yaml
index 6885035e..f52b1cf5 100644
--- a/paddlespeech/t2s/frontend/polyphonic.yaml
+++ b/paddlespeech/t2s/frontend/polyphonic.yaml
@@ -47,4 +47,8 @@ polyphonic:
恶行: ['e4','xing2']
唉: ['ai4']
扎实: ['zha1','shi2']
- 干将: ['gan4','jiang4']
\ No newline at end of file
+ 干将: ['gan4','jiang4']
+ 陈威行: ['chen2', 'wei1', 'hang2']
+ 郭晟: ['guo1', 'sheng4']
+ 中标: ['zhong4', 'biao1']
+ 抗住: ['kang2', 'zhu4']
\ No newline at end of file
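
The `Polyphonic` helper moved into its own module earlier in this patch simply looks words up in `polyphonic.yaml`; with the entries added above, a lookup behaves as in this sketch:

```python
from paddlespeech.t2s.frontend.polyphonic import Polyphonic

corrector = Polyphonic()
# word listed in polyphonic.yaml: the corrected reading is returned
print(corrector.correct_pronunciation('中标', ['zhong1', 'biao1']))  # ['zhong4', 'biao1']
# unknown word: the original pinyin is returned unchanged
print(corrector.correct_pronunciation('中文', ['zhong1', 'wen2']))   # ['zhong1', 'wen2']
```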
diff --git a/paddlespeech/t2s/frontend/sing_frontend.py b/paddlespeech/t2s/frontend/sing_frontend.py
new file mode 100644
index 00000000..fff72a10
--- /dev/null
+++ b/paddlespeech/t2s/frontend/sing_frontend.py
@@ -0,0 +1,175 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+from typing import Dict
+from typing import List
+
+import librosa
+import numpy as np
+import paddle
+from pypinyin import lazy_pinyin
+
+
+class SingFrontend():
+ def __init__(self, pinyin_phone_path: str, phone_vocab_path: str):
+ """SVS Frontend
+
+ Args:
+ pinyin_phone_path (str): pinyin to phone file path, a 'pinyin|phones' (like: ba|b a ) pair per line.
+ phone_vocab_path (str): phone to phone id file path, a 'phone phone id' (like: a 4 ) pair per line.
+ """
+ self.punc = '[、:,;。?!“”‘’\':,;.?!]'
+
+ self.pinyin_phones = {'AP': 'AP', 'SP': 'SP'}
+ if pinyin_phone_path:
+ with open(pinyin_phone_path, 'rt', encoding='utf-8') as f:
+ for line in f.readlines():
+ pinyin_phn = [
+ x.strip() for x in line.split('|') if x.strip() != ''
+ ]
+ self.pinyin_phones[pinyin_phn[0]] = pinyin_phn[1]
+
+ self.vocab_phones = {}
+ if phone_vocab_path:
+ with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
+ phn_id = [line.strip().split() for line in f.readlines()]
+ for phn, id in phn_id:
+ self.vocab_phones[phn] = int(id)
+
+    def get_phones(self, sentence: str) -> List[str]:
+ """get phone list
+
+ Args:
+ sentence (str): sentence
+
+ Returns:
+            List[str]: phone list, one entry (e.g. 'n i') per syllable
+
+ Example:
+ sentence = "你好"
+ phones = ['n i', 'h ao']
+ """
+ # remove all punc
+ sentence = re.sub(self.punc, "", sentence)
+
+        # pypinyin cannot disambiguate these polyphonic readings, so rewrite the words first
+ sentence = sentence.replace('最长', '最常').replace('长睫毛', '常睫毛') \
+ .replace('那么长', '那么常').replace('多长', '多常') \
+ .replace('很长', '很常')
+
+ # lyric
+ pinyins = lazy_pinyin(sentence, strict=False)
+ # replace unk word with SP
+ pinyins = [
+ pinyin if pinyin in self.pinyin_phones.keys() else "SP"
+ for pinyin in pinyins
+ ]
+ phones = [
+ self.pinyin_phones[pinyin.strip()] for pinyin in pinyins
+ if pinyin.strip() in self.pinyin_phones
+ ]
+
+ return phones
+
+ def get_note_info(self, note_info: str) -> List[str]:
+ note_info = [x.strip() for x in note_info.split('|') if x.strip() != '']
+ return note_info
+
+ def process(
+ self,
+ phones: List[int],
+ notes: List[str],
+ note_durs: List[float], ) -> Dict[str, List[paddle.Tensor]]:
+ new_phones = []
+ new_notes = []
+ new_note_durs = []
+ is_slurs = []
+ assert len(phones) == len(notes) == len(
+ note_durs
+ ), "Please check the input, text, notes, note_durs should be the same length."
+ for i in range(len(phones)):
+ phone = phones[i].split()
+ note = notes[i].split()
+ note_dur = note_durs[i].split()
+
+ for phn in phone:
+ new_phones.append(phn)
+ new_notes.append(note[0])
+ new_note_durs.append(note_dur[0])
+ is_slurs.append(0)
+
+ if len(note) > 1:
+ for i in range(1, len(note)):
+ new_phones.append(phone[-1])
+ new_notes.append(note[i])
+ new_note_durs.append(note_dur[i])
+ is_slurs.append(1)
+
+ return new_phones, new_notes, new_note_durs, is_slurs
+
+ def get_input_ids(self, svs_input: Dict[str, str],
+ to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
+ """convert input to int/float.
+
+ Args:
+            svs_input (Dict[str, str]): if input_type is 'phoneme', the keys phones, notes, note_durs and is_slurs are needed;
+                if input_type is 'word', the keys text, notes and note_durs are needed.
+ to_tensor (bool, optional): whether to convert to Tensor. Defaults to True.
+
+ Returns:
+ Dict[str, List[paddle.Tensor]]: result include phone_ids, note_ids, note_durs, is_slurs.
+ """
+ result = {}
+ input_type = svs_input['input_type']
+ if input_type == 'phoneme':
+ assert "phones" in svs_input.keys() and "notes" in svs_input.keys() and "note_durs" in svs_input.keys() and "is_slurs" in svs_input.keys(), \
+ "When input_type is phoneme, phones, notes, note_durs, is_slurs should be in the svs_input."
+ phones = svs_input["phones"].split()
+ notes = svs_input["notes"].split()
+ note_durs = svs_input["note_durs"].split()
+ is_slurs = svs_input["is_slurs"].split()
+ assert len(phones) == len(notes) == len(note_durs) == len(
+ is_slurs
+ ), "Please check the input, phones, notes, note_durs is_slurs should be the same length."
+ elif input_type == "word":
+ assert "text" in svs_input.keys() and "notes" in svs_input.keys() and "note_durs" in svs_input.keys(), \
+ "When input_type is word, text, notes, note_durs, should be in the svs_input."
+ phones = self.get_phones(svs_input['text'])
+ notes = self.get_note_info(svs_input['notes'])
+ note_durs = self.get_note_info(svs_input['note_durs'])
+ phones, notes, note_durs, is_slurs = self.process(
+ phones=phones, notes=notes, note_durs=note_durs)
+
+ phone_ids = [self.vocab_phones[phn] for phn in phones]
+ phone_ids = np.array(phone_ids, np.int64)
+ note_ids = [
+ librosa.note_to_midi(note.split("/")[0]) if note != 'rest' else 0
+ for note in notes
+ ]
+ note_ids = np.array(note_ids, np.int64)
+ note_durs = np.array(note_durs, np.float32)
+ is_slurs = np.array(is_slurs, np.int64)
+
+ if to_tensor:
+ phone_ids = paddle.to_tensor(phone_ids)
+ note_ids = paddle.to_tensor(note_ids)
+ note_durs = paddle.to_tensor(note_durs)
+ is_slurs = paddle.to_tensor(is_slurs)
+
+ result['phone_ids'] = [phone_ids]
+ result['note_ids'] = [note_ids]
+ result['note_durs'] = [note_durs]
+ result['is_slurs'] = [is_slurs]
+
+ return result
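
A minimal sketch of driving the new `SingFrontend` with phoneme-level input; the dictionary paths are placeholders, note names must be accepted by `librosa.note_to_midi` (with `rest` mapped to 0), and all four sequences must have the same length:

```python
from paddlespeech.t2s.frontend.sing_frontend import SingFrontend

frontend = SingFrontend(
    pinyin_phone_path='local/pinyin_to_phone.txt',  # placeholder
    phone_vocab_path='dump/phone_id_map.txt')       # placeholder

svs_input = {
    'input_type': 'phoneme',
    'phones': 'n i h ao',
    'notes': 'C4 C4 D4 rest',
    'note_durs': '0.5 0.5 0.8 0.2',
    'is_slurs': '0 0 0 0',
}
result = frontend.get_input_ids(svs_input, to_tensor=True)
# result['phone_ids'], result['note_ids'], result['note_durs'], result['is_slurs']
# are single-element lists of paddle tensors, ready to feed DiffSinger.
```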
diff --git a/paddlespeech/t2s/ssml/__init__.py b/paddlespeech/t2s/frontend/ssml/__init__.py
similarity index 89%
rename from paddlespeech/t2s/ssml/__init__.py
rename to paddlespeech/t2s/frontend/ssml/__init__.py
index 9b4db053..b1b9d726 100644
--- a/paddlespeech/t2s/ssml/__init__.py
+++ b/paddlespeech/t2s/frontend/ssml/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/frontend/ssml/xml_processor.py
similarity index 84%
rename from paddlespeech/t2s/ssml/xml_processor.py
rename to paddlespeech/t2s/frontend/ssml/xml_processor.py
index 892ca371..1d216c31 100644
--- a/paddlespeech/t2s/ssml/xml_processor.py
+++ b/paddlespeech/t2s/frontend/ssml/xml_processor.py
@@ -1,4 +1,17 @@
# -*- coding: utf-8 -*-
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
import re
import xml.dom.minidom
import xml.parsers.expat
@@ -17,7 +30,6 @@ Note: xml 有5种特殊字符, &<>"'
' '
例如:
"姓名"
-
'''
@@ -61,17 +73,29 @@ class MixTextProcessor():
patn = re.compile(r'(.*\s*?)(.*? )(.*\s*)$', re.M | re.S)
mat = re.match(patn, mixstr)
if mat:
+ # pre
pre_xml = mat.group(1)
+            # between <speak> ... </speak>
in_xml = mat.group(2)
+ # post
after_xml = mat.group(3)
- ctlist.append([pre_xml, []])
+            # leading text, no syllable info
+ if pre_xml:
+ ctlist.append([pre_xml, []])
+
+ # between with syllable
+ # [(sub sentence, [syllables]), ...]
dom = DomXml(in_xml)
pinyinlist = dom.get_pinyins_for_xml()
ctlist = ctlist + pinyinlist
- ctlist.append([after_xml, []])
+
+            # trailing text, no syllable info
+ if after_xml:
+ ctlist.append([after_xml, []])
else:
ctlist.append([mixstr, []])
+
return ctlist
@classmethod
@@ -86,17 +110,21 @@ class MixTextProcessor():
in_xml = mat.group(2)
after_xml = mat.group(3)
- ctlist.append(pre_xml)
+ if pre_xml:
+ ctlist.append(pre_xml)
+
dom = DomXml(in_xml)
tags = dom.get_text_and_sayas_tags()
ctlist.extend(tags)
-
- ctlist.append(after_xml)
- return ctlist
+
+ if after_xml:
+ ctlist.append(after_xml)
else:
ctlist.append(mixstr)
+
return ctlist
+
class DomXml():
def __init__(self, xmlstr):
self.tdom = parseString(xmlstr) #Document
diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py
index 42f7b8b2..3558064c 100644
--- a/paddlespeech/t2s/frontend/tone_sandhi.py
+++ b/paddlespeech/t2s/frontend/tone_sandhi.py
@@ -20,6 +20,9 @@ from pypinyin import Style
class ToneSandhi():
+ def __repr__(self):
+ return "MandarinToneSandhi"
+
def __init__(self):
self.must_neural_tone_words = {
'麻烦', '麻利', '鸳鸯', '高粱', '骨头', '骆驼', '马虎', '首饰', '馒头', '馄饨', '风筝',
@@ -65,9 +68,22 @@ class ToneSandhi():
'男子', '女子', '分子', '原子', '量子', '莲子', '石子', '瓜子', '电子', '人人', '虎虎',
'幺幺', '干嘛', '学子', '哈哈', '数数', '袅袅', '局地', '以下', '娃哈哈', '花花草草', '留得',
'耕地', '想想', '熙熙', '攘攘', '卵子', '死死', '冉冉', '恳恳', '佼佼', '吵吵', '打打',
- '考考', '整整', '莘莘', '落地', '算子', '家家户户'
+ '考考', '整整', '莘莘', '落地', '算子', '家家户户', '青青'
}
- self.punc = ":,;。?!“”‘’':,;.?!"
+ self.punc = "、:,;。?!“”‘’':,;.?!"
+
+ def _split_word(self, word: str) -> List[str]:
+ word_list = jieba.cut_for_search(word)
+ word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
+ first_subword = word_list[0]
+ first_begin_idx = word.find(first_subword)
+ if first_begin_idx == 0:
+ second_subword = word[len(first_subword):]
+ new_word_list = [first_subword, second_subword]
+ else:
+ second_subword = word[:-len(first_subword)]
+ new_word_list = [second_subword, first_subword]
+ return new_word_list
# the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
# e.g.
@@ -154,18 +170,8 @@ class ToneSandhi():
finals[i] = finals[i][:-1] + "4"
return finals
- def _split_word(self, word: str) -> List[str]:
- word_list = jieba.cut_for_search(word)
- word_list = sorted(word_list, key=lambda i: len(i), reverse=False)
- first_subword = word_list[0]
- first_begin_idx = word.find(first_subword)
- if first_begin_idx == 0:
- second_subword = word[len(first_subword):]
- new_word_list = [first_subword, second_subword]
- else:
- second_subword = word[:-len(first_subword)]
- new_word_list = [second_subword, first_subword]
- return new_word_list
+ def _all_tone_three(self, finals: List[str]) -> bool:
+ return all(x[-1] == "3" for x in finals)
def _three_sandhi(self, word: str, finals: List[str]) -> List[str]:
@@ -207,9 +213,6 @@ class ToneSandhi():
return finals
- def _all_tone_three(self, finals: List[str]) -> bool:
- return all(x[-1] == "3" for x in finals)
-
# merge "不" and the word behind it
# if don't merge, "不" sometimes appears alone according to jieba, which may occur sandhi error
def _merge_bu(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
@@ -234,30 +237,25 @@ class ToneSandhi():
# output seg: [['听一听', 'v']]
def _merge_yi(self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
new_seg = []
+ skip_next = False
# function 1
for i, (word, pos) in enumerate(seg):
- if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][
- 0] == seg[i + 1][0] and seg[i - 1][1] == "v":
- if i - 1 < len(new_seg):
- new_seg[i -
- 1][0] = new_seg[i - 1][0] + "一" + new_seg[i - 1][0]
- else:
- new_seg.append([word, pos])
- new_seg.append([seg[i + 1][0], pos])
+ if skip_next:
+ skip_next = False
+ continue
+ if i - 1 >= 0 and word == "一" and i + 1 < len(seg) and seg[i - 1][0] == seg[i + 1][0] and seg[i - 1][1] == "v":
+ new_seg[-1] = (new_seg[-1][0] + "一" + seg[i + 1][0], new_seg[-1][1])
+ skip_next = True
else:
- if i - 2 >= 0 and seg[i - 1][0] == "一" and seg[i - 2][
- 0] == word and pos == "v":
- continue
- else:
- new_seg.append([word, pos])
+ new_seg.append((word, pos))
seg = new_seg
new_seg = []
# function 2
for i, (word, pos) in enumerate(seg):
if new_seg and new_seg[-1][0] == "一":
- new_seg[-1][0] = new_seg[-1][0] + word
+ new_seg[-1] = (new_seg[-1][0] + word, new_seg[-1][1])
else:
- new_seg.append([word, pos])
+ new_seg.append((word, pos))
return new_seg
# the first and the second words are all_tone_three
@@ -336,6 +334,9 @@ class ToneSandhi():
def pre_merge_for_modify(
self, seg: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
+ """
+ seg: [(word, pos), ...]
+ """
seg = self._merge_bu(seg)
seg = self._merge_yi(seg)
seg = self._merge_reduplication(seg)
@@ -346,7 +347,11 @@ class ToneSandhi():
def modified_tone(self, word: str, pos: str,
finals: List[str]) -> List[str]:
-
+ """
+        word: the segmented word
+        pos: its part-of-speech tag
+        finals: finals with tone, [final1, ..., finaln]
+ """
finals = self._bu_sandhi(word, finals)
finals = self._yi_sandhi(word, finals)
finals = self._neural_sandhi(word, pos, finals)
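
The rewritten `_merge_yi` now works on tuples and uses a `skip_next` flag instead of mutating list entries; per the '听一听' comment above, the merge behaves as in this sketch:

```python
from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi

sandhi = ToneSandhi()
# jieba may split the reduplicated verb "听一听" into three tokens
seg = [('听', 'v'), ('一', 'm'), ('听', 'v')]
print(sandhi._merge_yi(seg))  # [('听一听', 'v')]
```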
diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py
index 35b97a93..1431bc6d 100644
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -14,6 +14,7 @@
import os
import re
from operator import itemgetter
+from pprint import pprint
from typing import Dict
from typing import List
@@ -30,10 +31,11 @@ from pypinyin_dict.phrase_pinyin_data import large_pinyin
from paddlespeech.t2s.frontend.g2pw import G2PWOnnxConverter
from paddlespeech.t2s.frontend.generate_lexicon import generate_lexicon
+from paddlespeech.t2s.frontend.polyphonic import Polyphonic
from paddlespeech.t2s.frontend.rhy_prediction.rhy_predictor import RhyPredictor
+from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
from paddlespeech.t2s.frontend.tone_sandhi import ToneSandhi
from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer
-from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor
INITIALS = [
'b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'zh', 'ch', 'sh',
@@ -41,6 +43,9 @@ INITIALS = [
]
INITIALS += ['y', 'w', 'sp', 'spl', 'spn', 'sil']
+# 0 for None, 5 for neutral
+TONES = ["0", "1", "2", "3", "4", "5"]
+
def intersperse(lst, item):
result = [item] * (len(lst) * 2 + 1)
@@ -49,34 +54,19 @@ def intersperse(lst, item):
def insert_after_character(lst, item):
+ """
+    insert `item` after each final.
+ """
result = [item]
+
for phone in lst:
result.append(phone)
if phone not in INITIALS:
# finals has tones
# assert phone[-1] in "12345"
result.append(item)
- return result
-
-
-class Polyphonic():
- def __init__(self):
- with open(
- os.path.join(
- os.path.dirname(os.path.abspath(__file__)),
- 'polyphonic.yaml'),
- 'r',
- encoding='utf-8') as polyphonic_file:
- # 解析yaml
- polyphonic_dict = yaml.load(polyphonic_file, Loader=yaml.FullLoader)
- self.polyphonic_words = polyphonic_dict["polyphonic"]
- def correct_pronunciation(self, word, pinyin):
- # 词汇被词典收录则返回纠正后的读音
- if word in self.polyphonic_words.keys():
- pinyin = self.polyphonic_words[word]
- # 否则返回原读音
- return pinyin
+ return result
class Frontend():
@@ -85,10 +75,8 @@ class Frontend():
phone_vocab_path=None,
tone_vocab_path=None,
use_rhy=False):
- self.mix_ssml_processor = MixTextProcessor()
- self.tone_modifier = ToneSandhi()
- self.text_normalizer = TextNormalizer()
- self.punc = ":,;。?!“”‘’':,;.?!"
+
+ self.punc = "、:,;。?!“”‘’':,;.?!"
self.rhy_phns = ['sp1', 'sp2', 'sp3', 'sp4']
self.phrases_dict = {
'开户行': [['ka1i'], ['hu4'], ['hang2']],
@@ -108,28 +96,7 @@ class Frontend():
'嘞': [['lei5']],
'掺和': [['chan1'], ['huo5']]
}
- self.use_rhy = use_rhy
- if use_rhy:
- self.rhy_predictor = RhyPredictor()
- print("Rhythm predictor loaded.")
- # g2p_model can be pypinyin and g2pM and g2pW
- self.g2p_model = g2p_model
- if self.g2p_model == "g2pM":
- self.g2pM_model = G2pM()
- self.pinyin2phone = generate_lexicon(
- with_tone=True, with_erhua=False)
- elif self.g2p_model == "g2pW":
- # use pypinyin as backup for non polyphonic characters in g2pW
- self._init_pypinyin()
- self.corrector = Polyphonic()
- self.g2pM_model = G2pM()
- self.g2pW_model = G2PWOnnxConverter(
- style='pinyin', enable_non_tradional_chinese=True)
- self.pinyin2phone = generate_lexicon(
- with_tone=True, with_erhua=False)
- else:
- self._init_pypinyin()
self.must_erhua = {
"小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿", "媳妇儿"
}
@@ -154,13 +121,51 @@ class Frontend():
for tone, id in tone_id:
self.vocab_tones[tone] = int(id)
+ # SSML
+ self.mix_ssml_processor = MixTextProcessor()
+ # tone sandhi
+ self.tone_modifier = ToneSandhi()
+ # TN
+ self.text_normalizer = TextNormalizer()
+
+ # prosody
+ self.use_rhy = use_rhy
+ if use_rhy:
+ self.rhy_predictor = RhyPredictor()
+ print("Rhythm predictor loaded.")
+
+ # g2p
+ assert g2p_model in ('pypinyin', 'g2pM', 'g2pW')
+ self.g2p_model = g2p_model
+ if self.g2p_model == "g2pM":
+ self.g2pM_model = G2pM()
+ self.pinyin2phone = generate_lexicon(
+ with_tone=True, with_erhua=False)
+ elif self.g2p_model == "g2pW":
+ # use pypinyin as backup for non polyphonic characters in g2pW
+ self._init_pypinyin()
+ self.corrector = Polyphonic()
+ self.g2pM_model = G2pM()
+ self.g2pW_model = G2PWOnnxConverter(
+ style='pinyin', enable_non_tradional_chinese=True)
+ self.pinyin2phone = generate_lexicon(
+ with_tone=True, with_erhua=False)
+ else:
+ self._init_pypinyin()
+
def _init_pypinyin(self):
+ """
+ Load pypinyin G2P module.
+ """
large_pinyin.load()
load_phrases_dict(self.phrases_dict)
# 调整字的拼音顺序
load_single_dict({ord(u'地'): u'de,di4'})
def _get_initials_finals(self, word: str) -> List[List[str]]:
+ """
+ Get word initial and final by pypinyin or g2pM
+ """
initials = []
finals = []
if self.g2p_model == "pypinyin":
@@ -171,11 +176,14 @@ class Frontend():
for c, v in zip(orig_initials, orig_finals):
if re.match(r'i\d', v):
if c in ['z', 'c', 's']:
+ # zi, ci, si
v = re.sub('i', 'ii', v)
elif c in ['zh', 'ch', 'sh', 'r']:
+ # zhi, chi, shi
v = re.sub('i', 'iii', v)
initials.append(c)
finals.append(v)
+
elif self.g2p_model == "g2pM":
pinyins = self.g2pM_model(word, tone=True, char_split=False)
for pinyin in pinyins:
@@ -192,58 +200,123 @@ class Frontend():
# If it's not pinyin (possibly punctuation) or no conversion is required
initials.append(pinyin)
finals.append(pinyin)
+
return initials, finals
+ def _merge_erhua(self,
+ initials: List[str],
+ finals: List[str],
+ word: str,
+ pos: str) -> List[List[str]]:
+ """
+        Apply erhua (merge 儿 into the preceding final where appropriate).
+ """
+ # fix er1
+ for i, phn in enumerate(finals):
+ if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1':
+ finals[i] = 'er2'
+
+        # 儿 kept as a separate syllable: no erhua merge
+ if word not in self.must_erhua and (word in self.not_erhua or
+ pos in {"a", "j", "nr"}):
+ return initials, finals
+
+ # "……" 等情况直接返回
+ if len(finals) != len(word):
+ return initials, finals
+
+ assert len(finals) == len(word)
+
+        # 儿 not pronounced on its own: merge it into the previous final
+ new_initials = []
+ new_finals = []
+ for i, phn in enumerate(finals):
+ if i == len(finals) - 1 and word[i] == "儿" and phn in {
+ "er2", "er5"
+ } and word[-2:] not in self.not_erhua and new_finals:
+ new_finals[-1] = new_finals[-1][:-1] + "r" + new_finals[-1][-1]
+ else:
+ new_initials.append(initials[i])
+ new_finals.append(phn)
+
+ return new_initials, new_finals
+
# if merge_sentences, merge all sentences into one phone sequence
def _g2p(self,
sentences: List[str],
merge_sentences: bool=True,
with_erhua: bool=True) -> List[List[str]]:
+ """
+        Return: list of phoneme lists, e.g.:
+ [['w', 'o3', 'm', 'en2', 'sp'], ...]
+ """
segments = sentences
phones_list = []
+
+ # split by punctuation
for seg in segments:
if self.use_rhy:
seg = self.rhy_predictor._clean_text(seg)
- phones = []
- # Replace all English words in the sentence
+
+ # remove all English words in the sentence
seg = re.sub('[a-zA-Z]+', '', seg)
+
+ # add prosody mark
if self.use_rhy:
seg = self.rhy_predictor.get_prediction(seg)
+
+ # [(word, pos), ...]
seg_cut = psg.lcut(seg)
- initials = []
- finals = []
+ # fix wordseg bad case for sandhi
seg_cut = self.tone_modifier.pre_merge_for_modify(seg_cut)
+
# 为了多音词获得更好的效果,这里采用整句预测
+ phones = []
+ initials = []
+ finals = []
if self.g2p_model == "g2pW":
try:
+ # undo prosody
if self.use_rhy:
seg = self.rhy_predictor._clean_text(seg)
+
+ # g2p
pinyins = self.g2pW_model(seg)[0]
except Exception:
- # g2pW采用模型采用繁体输入,如果有cover不了的简体词,采用g2pM预测
+ # g2pW 模型采用繁体输入,如果有cover不了的简体词,采用g2pM预测
print("[%s] not in g2pW dict,use g2pM" % seg)
pinyins = self.g2pM_model(seg, tone=True, char_split=False)
+
+ # do prosody
if self.use_rhy:
rhy_text = self.rhy_predictor.get_prediction(seg)
final_py = self.rhy_predictor.pinyin_align(pinyins,
rhy_text)
pinyins = final_py
+
pre_word_length = 0
for word, pos in seg_cut:
sub_initials = []
sub_finals = []
now_word_length = pre_word_length + len(word)
+
+ # skip english word
if pos == 'eng':
pre_word_length = now_word_length
continue
+
word_pinyins = pinyins[pre_word_length:now_word_length]
- # 矫正发音
+
+                    # disambiguate polyphonic characters
word_pinyins = self.corrector.correct_pronunciation(
word, word_pinyins)
+
for pinyin, char in zip(word_pinyins, word):
if pinyin is None:
pinyin = char
+
pinyin = pinyin.replace("u:", "v")
+
if pinyin in self.pinyin2phone:
initial_final_list = self.pinyin2phone[
pinyin].split(" ")
@@ -257,28 +330,41 @@ class Frontend():
# If it's not pinyin (possibly punctuation) or no conversion is required
sub_initials.append(pinyin)
sub_finals.append(pinyin)
+
pre_word_length = now_word_length
+ # tone sandhi
sub_finals = self.tone_modifier.modified_tone(word, pos,
sub_finals)
+ # er hua
if with_erhua:
sub_initials, sub_finals = self._merge_erhua(
sub_initials, sub_finals, word, pos)
+
initials.append(sub_initials)
finals.append(sub_finals)
# assert len(sub_initials) == len(sub_finals) == len(word)
else:
+ # pypinyin, g2pM
for word, pos in seg_cut:
if pos == 'eng':
+ # skip english word
continue
+
+ # g2p
sub_initials, sub_finals = self._get_initials_finals(word)
+ # tone sandhi
sub_finals = self.tone_modifier.modified_tone(word, pos,
sub_finals)
+ # er hua
if with_erhua:
sub_initials, sub_finals = self._merge_erhua(
sub_initials, sub_finals, word, pos)
+
initials.append(sub_initials)
finals.append(sub_finals)
# assert len(sub_initials) == len(sub_finals) == len(word)
+
+        # flatten the per-word lists: sum(list_of_lists, []) concatenates them
initials = sum(initials, [])
finals = sum(finals, [])
@@ -287,111 +373,34 @@ class Frontend():
# we discriminate i, ii and iii
if c and c not in self.punc:
phones.append(c)
+ # replace punctuation by `sp`
if c and c in self.punc:
phones.append('sp')
+
if v and v not in self.punc and v not in self.rhy_phns:
phones.append(v)
- phones_list.append(phones)
- if merge_sentences:
- merge_list = sum(phones_list, [])
- # rm the last 'sp' to avoid the noise at the end
- # cause in the training data, no 'sp' in the end
- if merge_list[-1] == 'sp':
- merge_list = merge_list[:-1]
- phones_list = []
- phones_list.append(merge_list)
- return phones_list
- def _split_word_to_char(self, words):
- res = []
- for x in words:
- res.append(x)
- return res
-
- # if using ssml, have pingyin specified, assign pinyin to words
- def _g2p_assign(self,
- words: List[str],
- pinyin_spec: List[str],
- merge_sentences: bool=True) -> List[List[str]]:
- phones_list = []
- initials = []
- finals = []
-
- words = self._split_word_to_char(words[0])
- for pinyin, char in zip(pinyin_spec, words):
- sub_initials = []
- sub_finals = []
- pinyin = pinyin.replace("u:", "v")
- #self.pinyin2phone: is a dict with all pinyin mapped with sheng_mu yun_mu
- if pinyin in self.pinyin2phone:
- initial_final_list = self.pinyin2phone[pinyin].split(" ")
- if len(initial_final_list) == 2:
- sub_initials.append(initial_final_list[0])
- sub_finals.append(initial_final_list[1])
- elif len(initial_final_list) == 1:
- sub_initials.append('')
- sub_finals.append(initial_final_list[1])
- else:
- # If it's not pinyin (possibly punctuation) or no conversion is required
- sub_initials.append(pinyin)
- sub_finals.append(pinyin)
- initials.append(sub_initials)
- finals.append(sub_finals)
+ phones_list.append(phones)
- initials = sum(initials, [])
- finals = sum(finals, [])
- phones = []
- for c, v in zip(initials, finals):
- # NOTE: post process for pypinyin outputs
- # we discriminate i, ii and iii
- if c and c not in self.punc:
- phones.append(c)
- if c and c in self.punc:
- phones.append('sp')
- if v and v not in self.punc and v not in self.rhy_phns:
- phones.append(v)
- phones_list.append(phones)
+        # merge the split sub-sentences back into one sentence
if merge_sentences:
+ # sub sentence phonemes
merge_list = sum(phones_list, [])
# rm the last 'sp' to avoid the noise at the end
# cause in the training data, no 'sp' in the end
if merge_list[-1] == 'sp':
merge_list = merge_list[:-1]
+
+ # sentence phonemes
phones_list = []
phones_list.append(merge_list)
- return phones_list
- def _merge_erhua(self,
- initials: List[str],
- finals: List[str],
- word: str,
- pos: str) -> List[List[str]]:
- # fix er1
- for i, phn in enumerate(finals):
- if i == len(finals) - 1 and word[i] == "儿" and phn == 'er1':
- finals[i] = 'er2'
- if word not in self.must_erhua and (word in self.not_erhua or
- pos in {"a", "j", "nr"}):
- return initials, finals
- # "……" 等情况直接返回
- if len(finals) != len(word):
- return initials, finals
-
- assert len(finals) == len(word)
-
- new_initials = []
- new_finals = []
- for i, phn in enumerate(finals):
- if i == len(finals) - 1 and word[i] == "儿" and phn in {
- "er2", "er5"
- } and word[-2:] not in self.not_erhua and new_finals:
- new_finals[-1] = new_finals[-1][:-1] + "r" + new_finals[-1][-1]
- else:
- new_finals.append(phn)
- new_initials.append(initials[i])
- return new_initials, new_finals
+ return phones_list
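In `_g2p` above, `initials` and `finals` are per-word lists of lists, and `sum(initials, [])` flattens them before the initial/final pairs are zipped into phones. A tiny sketch of that idiom:

```python
# sum(iterable, start) with a list start concatenates the sub-lists.
initials = [["zh", "g"], ["x"]]
finals = [["ong1", "uo2"], ["iao3"]]
assert sum(initials, []) == ["zh", "g", "x"]
assert sum(finals, []) == ["ong1", "uo2", "iao3"]
```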
def _p2id(self, phonemes: List[str]) -> np.ndarray:
+ """
+ Phoneme to Index
+ """
# replace unk phone with sp
phonemes = [
phn if phn in self.vocab_phones else "sp" for phn in phonemes
@@ -400,6 +409,9 @@ class Frontend():
return np.array(phone_ids, np.int64)
def _t2id(self, tones: List[str]) -> np.ndarray:
+ """
+ Tone to Index.
+ """
    # replace unk tone with "0"
tones = [tone if tone in self.vocab_tones else "0" for tone in tones]
tone_ids = [self.vocab_tones[item] for item in tones]
@@ -407,6 +419,9 @@ class Frontend():
def _get_phone_tone(self, phonemes: List[str],
get_tone_ids: bool=False) -> List[List[str]]:
+ """
+ Get tone from phonemes.
+ """
phones = []
tones = []
if get_tone_ids and self.vocab_tones:
@@ -423,13 +438,14 @@ class Frontend():
-1] == 'r' and phone not in self.vocab_phones and phone[:
-1] in self.vocab_phones:
phones.append(phone[:-1])
- phones.append("er")
tones.append(tone)
+ phones.append("er")
tones.append("2")
else:
phones.append(phone)
tones.append(tone)
else:
+ # initals with 0 tone.
phones.append(full_phone)
tones.append('0')
else:
@@ -443,6 +459,7 @@ class Frontend():
phones.append("er2")
else:
phones.append(phone)
+
return phones, tones
def get_phonemes(self,
@@ -451,10 +468,16 @@ class Frontend():
with_erhua: bool=True,
robot: bool=False,
print_info: bool=False) -> List[List[str]]:
+ """
+ Main function to do G2P
+ """
+ # TN & Text Segmentation
sentences = self.text_normalizer.normalize(sentence)
+ # Prosody & WS & g2p & tone sandhi
phonemes = self._g2p(
sentences, merge_sentences=merge_sentences, with_erhua=with_erhua)
- # change all tones to `1`
+
+ # simulate robot pronunciation, change all tones to `1`
if robot:
new_phonemes = []
for sentence in phonemes:
@@ -466,6 +489,7 @@ class Frontend():
new_sentence.append(item)
new_phonemes.append(new_sentence)
phonemes = new_phonemes
+
if print_info:
print("----------------------------")
print("text norm results:")
@@ -476,25 +500,104 @@ class Frontend():
print("----------------------------")
return phonemes
- #@an added for ssml pinyin
+ def _split_word_to_char(self, words):
+ res = []
+ for x in words:
+ res.append(x)
+ return res
+
+    # if using SSML with pinyin specified, assign that pinyin to the words
+ def _g2p_assign(self,
+ words: List[str],
+ pinyin_spec: List[str],
+ merge_sentences: bool=True) -> List[List[str]]:
+ """
+ Replace phoneme by SSML
+ """
+ phones_list = []
+ initials = []
+ finals = []
+
+        # to character list
+ words = self._split_word_to_char(words[0])
+
+ for pinyin, char in zip(pinyin_spec, words):
+ sub_initials = []
+ sub_finals = []
+ pinyin = pinyin.replace("u:", "v")
+
+            # self.pinyin2phone: dict mapping each pinyin to its "shengmu yunmu" phones
+ if pinyin in self.pinyin2phone:
+ initial_final_list = self.pinyin2phone[pinyin].split(" ")
+ if len(initial_final_list) == 2:
+ sub_initials.append(initial_final_list[0])
+ sub_finals.append(initial_final_list[1])
+ elif len(initial_final_list) == 1:
+ sub_initials.append('')
+ sub_finals.append(initial_final_list[1])
+ else:
+ # If it's not pinyin (possibly punctuation) or no conversion is required
+ sub_initials.append(pinyin)
+ sub_finals.append(pinyin)
+
+ initials.append(sub_initials)
+ finals.append(sub_finals)
+
+ initials = sum(initials, [])
+ finals = sum(finals, [])
+
+ phones = []
+ for c, v in zip(initials, finals):
+ # c for consonant, v for vowel
+ # NOTE: post process for pypinyin outputs
+ # we discriminate i, ii and iii
+ if c and c not in self.punc:
+ phones.append(c)
+            # replace punctuation with `sp`
+ if c and c in self.punc:
+ phones.append('sp')
+ if v and v not in self.punc and v not in self.rhy_phns:
+ phones.append(v)
+ phones_list.append(phones)
+
+ if merge_sentences:
+ merge_list = sum(phones_list, [])
+            # remove the last 'sp' to avoid noise at the end,
+            # because the training data has no 'sp' at the end
+ if merge_list[-1] == 'sp':
+ merge_list = merge_list[:-1]
+ phones_list = []
+ phones_list.append(merge_list)
+
+ return phones_list
+
def get_phonemes_ssml(self,
ssml_inputs: list,
merge_sentences: bool=True,
with_erhua: bool=True,
robot: bool=False,
print_info: bool=False) -> List[List[str]]:
+ """
+ Main function to do G2P with SSML support.
+ """
all_phonemes = []
for word_pinyin_item in ssml_inputs:
phonemes = []
+
+ # ['你喜欢', []] -> 你喜欢 []
sentence, pinyin_spec = itemgetter(0, 1)(word_pinyin_item)
+
+ # TN & Text Segmentation
sentences = self.text_normalizer.normalize(sentence)
+
if len(pinyin_spec) == 0:
+                # g2p for words without specified pinyin
phonemes = self._g2p(
sentences,
merge_sentences=merge_sentences,
with_erhua=with_erhua)
else:
- # phonemes should be pinyin_spec
+                # use the word phonemes specified by SSML
phonemes = self._g2p_assign(
sentences, pinyin_spec, merge_sentences=merge_sentences)
@@ -512,17 +615,24 @@ class Frontend():
new_phonemes.append(new_sentence)
all_phonemes = new_phonemes
+ if merge_sentences:
+ all_phonemes = [sum(all_phonemes, [])]
+
if print_info:
print("----------------------------")
print("text norm results:")
print(sentences)
print("----------------------------")
print("g2p results:")
- print(all_phonemes[0])
+ print(all_phonemes)
print("----------------------------")
- return [sum(all_phonemes, [])]
+
+ return all_phonemes
def add_sp_if_no(self, phonemes):
+ """
+        Append prosody mark 'sp4' at the sentence end if no 'sp' mark is present.
+ """
if not phonemes[-1][-1].startswith('sp'):
phonemes[-1].append('sp4')
return phonemes
@@ -542,8 +652,11 @@ class Frontend():
merge_sentences=merge_sentences,
print_info=print_info,
robot=robot)
+
+ # add #4 for sentence end.
if self.use_rhy:
phonemes = self.add_sp_if_no(phonemes)
+
result = {}
phones = []
tones = []
@@ -551,28 +664,33 @@ class Frontend():
temp_tone_ids = []
for part_phonemes in phonemes:
+
phones, tones = self._get_phone_tone(
part_phonemes, get_tone_ids=get_tone_ids)
+
if add_blank:
phones = insert_after_character(phones, blank_token)
+
if tones:
tone_ids = self._t2id(tones)
if to_tensor:
tone_ids = paddle.to_tensor(tone_ids)
temp_tone_ids.append(tone_ids)
+
if phones:
phone_ids = self._p2id(phones)
            # if paddle.to_tensor() is used in onnxruntime, the first call will be too slow
if to_tensor:
phone_ids = paddle.to_tensor(phone_ids)
temp_phone_ids.append(phone_ids)
+
if temp_tone_ids:
result["tone_ids"] = temp_tone_ids
if temp_phone_ids:
result["phone_ids"] = temp_phone_ids
+
return result
- # @an added for ssml
def get_input_ids_ssml(
self,
sentence: str,
@@ -584,12 +702,15 @@ class Frontend():
blank_token: str="",
to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
- l_inputs = MixTextProcessor.get_pinyin_split(sentence)
+        # split the sentence by SSML tags.
+ texts = MixTextProcessor.get_pinyin_split(sentence)
+
phonemes = self.get_phonemes_ssml(
- l_inputs,
+ texts,
merge_sentences=merge_sentences,
print_info=print_info,
robot=robot)
+
result = {}
phones = []
tones = []
@@ -599,21 +720,26 @@ class Frontend():
for part_phonemes in phonemes:
phones, tones = self._get_phone_tone(
part_phonemes, get_tone_ids=get_tone_ids)
+
if add_blank:
phones = insert_after_character(phones, blank_token)
+
if tones:
tone_ids = self._t2id(tones)
if to_tensor:
tone_ids = paddle.to_tensor(tone_ids)
temp_tone_ids.append(tone_ids)
+
if phones:
phone_ids = self._p2id(phones)
                # if paddle.to_tensor() is used in onnxruntime, the first call will be too slow
if to_tensor:
phone_ids = paddle.to_tensor(phone_ids)
temp_phone_ids.append(phone_ids)
+
if temp_tone_ids:
result["tone_ids"] = temp_tone_ids
if temp_phone_ids:
result["phone_ids"] = temp_phone_ids
+
return result
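For reference, `get_phonemes_ssml` above consumes the `[text, pinyin_spec]` pairs produced by `MixTextProcessor.get_pinyin_split` and unpacks them with `itemgetter(0, 1)`. A hedged sketch of that item structure; the sample data below is invented for illustration:

```python
from operator import itemgetter

# hypothetical parsed SSML result: plain text carries an empty pinyin spec,
# while a pinyin-annotated span carries the user-specified syllables
word_pinyin_items = [["你喜欢", []], ["好", ["hao3"]]]

for item in word_pinyin_items:
    sentence, pinyin_spec = itemgetter(0, 1)(item)
    if pinyin_spec:
        print(sentence, "->", pinyin_spec)   # 好 -> ['hao3'], handled by _g2p_assign
    else:
        print(sentence, "-> run the normal _g2p path")
```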
diff --git a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
index 06b5d41b..51835112 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/phonecode.py
@@ -24,7 +24,7 @@ from .num import verbalize_digit
RE_MOBILE_PHONE = re.compile(
r"(? Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
+ """Calculate forward propagation.
+
+ Args:
+ text(Tensor(int64)):
+ Batch of padded token (phone) ids (B, Tmax).
+ note(Tensor(int64)):
+ Batch of padded note (element in music score) ids (B, Tmax).
+ note_dur(Tensor(float32)):
+ Batch of padded note durations in seconds (element in music score) (B, Tmax).
+ is_slur(Tensor(int64)):
+ Batch of padded slur (element in music score) ids (B, Tmax).
+ text_lengths(Tensor(int64)):
+ Batch of phone lengths of each input (B,).
+ speech(Tensor[float32]):
+ Batch of padded target features (e.g. mel) (B, Lmax, odim).
+ speech_lengths(Tensor(int64)):
+ Batch of the lengths of each target features (B,).
+ durations(Tensor(int64)):
+ Batch of padded token durations in frame (B, Tmax).
+ pitch(Tensor[float32]):
+ Batch of padded frame-averaged pitch (B, Lmax, 1).
+ energy(Tensor[float32]):
+ Batch of padded frame-averaged energy (B, Lmax, 1).
+ spk_emb(Tensor[float32], optional):
+ Batch of speaker embeddings (B, spk_embed_dim).
+        spk_id(Tensor(int64), optional):
+ Batch of speaker ids (B,)
+ only_train_fs2(bool):
+ Whether to train only the fastspeech2 module
+
+ Returns:
+
+ """
+ # only train fastspeech2 module firstly
+ before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.fs2(
+ text=text,
+ note=note,
+ note_dur=note_dur,
+ is_slur=is_slur,
+ text_lengths=text_lengths,
+ speech=speech,
+ speech_lengths=speech_lengths,
+ durations=durations,
+ pitch=pitch,
+ energy=energy,
+ spk_id=spk_id,
+ spk_emb=spk_emb)
+ if only_train_fs2:
+ return before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits
+
+ # get the encoder output from fastspeech2 as the condition of denoiser module
+ cond_fs2, mel_masks = self.fs2.encoder_infer_batch(
+ text=text,
+ note=note,
+ note_dur=note_dur,
+ is_slur=is_slur,
+ text_lengths=text_lengths,
+ speech_lengths=speech_lengths,
+ ds=durations,
+ ps=pitch,
+ es=energy)
+ cond_fs2 = cond_fs2.transpose((0, 2, 1))
+
+ # get the output(final mel) from diffusion module
+ noise_pred, noise_target = self.diffusion(
+ speech.transpose((0, 2, 1)), cond_fs2)
+ return noise_pred, noise_target, mel_masks
+
+ def inference(
+ self,
+ text: paddle.Tensor,
+ note: paddle.Tensor,
+ note_dur: paddle.Tensor,
+ is_slur: paddle.Tensor,
+ get_mel_fs2: bool=False, ):
+ """Run inference
+
+ Args:
+ text(Tensor(int64)):
+ Batch of padded token (phone) ids (B, Tmax).
+ note(Tensor(int64)):
+ Batch of padded note (element in music score) ids (B, Tmax).
+ note_dur(Tensor(float32)):
+ Batch of padded note durations in seconds (element in music score) (B, Tmax).
+ is_slur(Tensor(int64)):
+ Batch of padded slur (element in music score) ids (B, Tmax).
+            get_mel_fs2 (bool, optional):
+                Whether to get mel from the fastspeech2 module. Defaults to False.
+
+ Returns:
+
+ """
+ mel_fs2, _, _, _ = self.fs2.inference(text, note, note_dur, is_slur)
+ if get_mel_fs2:
+ return mel_fs2
+ mel_fs2 = mel_fs2.unsqueeze(0).transpose((0, 2, 1))
+ cond_fs2 = self.fs2.encoder_infer(text, note, note_dur, is_slur)
+ cond_fs2 = cond_fs2.transpose((0, 2, 1))
+ noise = paddle.randn(mel_fs2.shape)
+ mel = self.diffusion.inference(
+ noise=noise,
+ cond=cond_fs2,
+ ref_x=mel_fs2,
+ scheduler_type="ddpm",
+ num_inference_steps=60)
+ mel = mel.transpose((0, 2, 1))
+ return mel[0]
+
+
+class DiffSingerInference(nn.Layer):
+ def __init__(self, normalizer, model):
+ super().__init__()
+ self.normalizer = normalizer
+ self.acoustic_model = model
+
+ def forward(self, text, note, note_dur, is_slur, get_mel_fs2: bool=False):
+ """Calculate forward propagation.
+
+ Args:
+ text(Tensor(int64)):
+ Batch of padded token (phone) ids (B, Tmax).
+ note(Tensor(int64)):
+ Batch of padded note (element in music score) ids (B, Tmax).
+ note_dur(Tensor(float32)):
+ Batch of padded note durations in seconds (element in music score) (B, Tmax).
+ is_slur(Tensor(int64)):
+ Batch of padded slur (element in music score) ids (B, Tmax).
+            get_mel_fs2 (bool, optional):
+                Whether to get mel from the fastspeech2 module. Defaults to False.
+
+ Returns:
+ logmel(Tensor(float32)): denorm logmel, [T, mel_bin]
+ """
+ normalized_mel = self.acoustic_model.inference(
+ text=text,
+ note=note,
+ note_dur=note_dur,
+ is_slur=is_slur,
+ get_mel_fs2=get_mel_fs2)
+ logmel = normalized_mel
+ return logmel
+
+
+class DiffusionLoss(nn.Layer):
+ """Loss function module for Diffusion module on DiffSinger."""
+
+ def __init__(self, use_masking: bool=True,
+ use_weighted_masking: bool=False):
+ """Initialize feed-forward Transformer loss module.
+ Args:
+ use_masking (bool):
+ Whether to apply masking for padded part in loss calculation.
+ use_weighted_masking (bool):
+ Whether to weighted masking in loss calculation.
+ """
+ assert check_argument_types()
+ super().__init__()
+
+ assert (use_masking != use_weighted_masking) or not use_masking
+ self.use_masking = use_masking
+ self.use_weighted_masking = use_weighted_masking
+
+ # define criterions
+ reduction = "none" if self.use_weighted_masking else "mean"
+ self.l1_criterion = nn.L1Loss(reduction=reduction)
+
+ def forward(
+ self,
+ noise_pred: paddle.Tensor,
+ noise_target: paddle.Tensor,
+ mel_masks: paddle.Tensor, ) -> paddle.Tensor:
+ """Calculate forward propagation.
+
+ Args:
+ noise_pred(Tensor):
+ Batch of outputs predict noise (B, Lmax, odim).
+ noise_target(Tensor):
+ Batch of target noise (B, Lmax, odim).
+ mel_masks(Tensor):
+ Batch of mask of real mel (B, Lmax, 1).
+ Returns:
+
+ """
+ # apply mask to remove padded part
+ if self.use_masking:
+ noise_pred = noise_pred.masked_select(
+ mel_masks.broadcast_to(noise_pred.shape))
+ noise_target = noise_target.masked_select(
+ mel_masks.broadcast_to(noise_target.shape))
+
+ # calculate loss
+ l1_loss = self.l1_criterion(noise_pred, noise_target)
+
+ # make weighted mask and apply it
+ if self.use_weighted_masking:
+ mel_masks = mel_masks.unsqueeze(-1)
+ out_weights = mel_masks.cast(dtype=paddle.float32) / mel_masks.cast(
+ dtype=paddle.float32).sum(
+ axis=1, keepdim=True)
+ out_weights /= noise_target.shape[0] * noise_target.shape[2]
+
+ # apply weight
+ l1_loss = l1_loss.multiply(out_weights)
+ l1_loss = l1_loss.masked_select(
+ mel_masks.broadcast_to(l1_loss.shape)).sum()
+
+ return l1_loss
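`DiffusionLoss` above keeps only the unpadded frames (via `masked_select`) before averaging the L1 distance between predicted and target noise. A self-contained sketch of that masking pattern; all shapes and values below are illustrative only:

```python
import paddle

B, T, D = 2, 5, 4
noise_pred = paddle.randn([B, T, D])
noise_target = paddle.randn([B, T, D])
# True for real mel frames, False for padding
mel_masks = paddle.to_tensor([[True] * 3 + [False] * 2,
                              [True] * 5]).unsqueeze(-1)

l1 = paddle.nn.L1Loss(reduction="mean")
loss = l1(noise_pred.masked_select(mel_masks.broadcast_to(noise_pred.shape)),
          noise_target.masked_select(mel_masks.broadcast_to(noise_target.shape)))
print(float(loss))
```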
diff --git a/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py b/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py
new file mode 100644
index 00000000..d89b09b2
--- /dev/null
+++ b/paddlespeech/t2s/models/diffsinger/diffsinger_updater.py
@@ -0,0 +1,302 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+from pathlib import Path
+from typing import Dict
+
+import paddle
+from paddle import distributed as dist
+from paddle.io import DataLoader
+from paddle.nn import Layer
+from paddle.optimizer import Optimizer
+
+from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
+from paddlespeech.t2s.training.reporter import report
+from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
+from paddlespeech.t2s.training.updaters.standard_updater import UpdaterState
+
+logging.basicConfig(
+ format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
+ datefmt='[%Y-%m-%d %H:%M:%S]')
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+class DiffSingerUpdater(StandardUpdater):
+ def __init__(self,
+ model: Layer,
+ optimizers: Dict[str, Optimizer],
+ criterions: Dict[str, Layer],
+ dataloader: DataLoader,
+ ds_train_start_steps: int=160000,
+ output_dir: Path=None,
+ only_train_diffusion: bool=True):
+ super().__init__(model, optimizers, dataloader, init_state=None)
+ self.model = model._layers if isinstance(model,
+ paddle.DataParallel) else model
+ self.only_train_diffusion = only_train_diffusion
+
+ self.optimizers = optimizers
+ self.optimizer_fs2: Optimizer = optimizers['fs2']
+ self.optimizer_ds: Optimizer = optimizers['ds']
+
+ self.criterions = criterions
+ self.criterion_fs2 = criterions['fs2']
+ self.criterion_ds = criterions['ds']
+
+ self.dataloader = dataloader
+
+ self.ds_train_start_steps = ds_train_start_steps
+
+ self.state = UpdaterState(iteration=0, epoch=0)
+ self.train_iterator = iter(self.dataloader)
+
+ log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
+ self.filehandler = logging.FileHandler(str(log_file))
+ logger.addHandler(self.filehandler)
+ self.logger = logger
+ self.msg = ""
+
+ def update_core(self, batch):
+ self.msg = "Rank: {}, ".format(dist.get_rank())
+ losses_dict = {}
+ # spk_id!=None in multiple spk diffsinger
+ spk_id = batch["spk_id"] if "spk_id" in batch else None
+ spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
+ # No explicit speaker identifier labels are used during voice cloning training.
+ if spk_emb is not None:
+ spk_id = None
+
+ # only train fastspeech2 module firstly
+ if self.state.iteration < self.ds_train_start_steps:
+ before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model(
+ text=batch["text"],
+ note=batch["note"],
+ note_dur=batch["note_dur"],
+ is_slur=batch["is_slur"],
+ text_lengths=batch["text_lengths"],
+ speech=batch["speech"],
+ speech_lengths=batch["speech_lengths"],
+ durations=batch["durations"],
+ pitch=batch["pitch"],
+ energy=batch["energy"],
+ spk_id=spk_id,
+ spk_emb=spk_emb,
+ only_train_fs2=True, )
+
+ l1_loss_fs2, ssim_loss_fs2, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion_fs2(
+ after_outs=after_outs,
+ before_outs=before_outs,
+ d_outs=d_outs,
+ p_outs=p_outs,
+ e_outs=e_outs,
+ ys=ys,
+ ds=batch["durations"],
+ ps=batch["pitch"],
+ es=batch["energy"],
+ ilens=batch["text_lengths"],
+ olens=olens,
+ spk_logits=spk_logits,
+ spk_ids=spk_id, )
+
+ loss_fs2 = l1_loss_fs2 + ssim_loss_fs2 + duration_loss + pitch_loss + energy_loss + speaker_loss
+
+ self.optimizer_fs2.clear_grad()
+ loss_fs2.backward()
+ self.optimizer_fs2.step()
+
+ report("train/loss_fs2", float(loss_fs2))
+ report("train/l1_loss_fs2", float(l1_loss_fs2))
+ report("train/ssim_loss_fs2", float(ssim_loss_fs2))
+ report("train/duration_loss", float(duration_loss))
+ report("train/pitch_loss", float(pitch_loss))
+
+ losses_dict["l1_loss_fs2"] = float(l1_loss_fs2)
+ losses_dict["ssim_loss_fs2"] = float(ssim_loss_fs2)
+ losses_dict["duration_loss"] = float(duration_loss)
+ losses_dict["pitch_loss"] = float(pitch_loss)
+
+ if speaker_loss != 0.:
+ report("train/speaker_loss", float(speaker_loss))
+ losses_dict["speaker_loss"] = float(speaker_loss)
+ if energy_loss != 0.:
+ report("train/energy_loss", float(energy_loss))
+ losses_dict["energy_loss"] = float(energy_loss)
+
+ losses_dict["loss_fs2"] = float(loss_fs2)
+ self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
+ for k, v in losses_dict.items())
+
+ # Then only train diffusion module, freeze fastspeech2 parameters.
+ if self.state.iteration > self.ds_train_start_steps:
+ for param in self.model.fs2.parameters():
+ param.trainable = False if self.only_train_diffusion else True
+
+ noise_pred, noise_target, mel_masks = self.model(
+ text=batch["text"],
+ note=batch["note"],
+ note_dur=batch["note_dur"],
+ is_slur=batch["is_slur"],
+ text_lengths=batch["text_lengths"],
+ speech=batch["speech"],
+ speech_lengths=batch["speech_lengths"],
+ durations=batch["durations"],
+ pitch=batch["pitch"],
+ energy=batch["energy"],
+ spk_id=spk_id,
+ spk_emb=spk_emb,
+ only_train_fs2=False, )
+
+ noise_pred = noise_pred.transpose((0, 2, 1))
+ noise_target = noise_target.transpose((0, 2, 1))
+ mel_masks = mel_masks.transpose((0, 2, 1))
+ l1_loss_ds = self.criterion_ds(
+ noise_pred=noise_pred,
+ noise_target=noise_target,
+ mel_masks=mel_masks, )
+
+ loss_ds = l1_loss_ds
+
+ self.optimizer_ds.clear_grad()
+ loss_ds.backward()
+ self.optimizer_ds.step()
+
+ report("train/loss_ds", float(loss_ds))
+ report("train/l1_loss_ds", float(l1_loss_ds))
+ losses_dict["l1_loss_ds"] = float(l1_loss_ds)
+ losses_dict["loss_ds"] = float(loss_ds)
+ self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
+ for k, v in losses_dict.items())
+
+ self.logger.info(self.msg)
+
+
+class DiffSingerEvaluator(StandardEvaluator):
+ def __init__(
+ self,
+ model: Layer,
+ criterions: Dict[str, Layer],
+ dataloader: DataLoader,
+ output_dir: Path=None, ):
+ super().__init__(model, dataloader)
+ self.model = model._layers if isinstance(model,
+ paddle.DataParallel) else model
+
+ self.criterions = criterions
+ self.criterion_fs2 = criterions['fs2']
+ self.criterion_ds = criterions['ds']
+ self.dataloader = dataloader
+
+ log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
+ self.filehandler = logging.FileHandler(str(log_file))
+ logger.addHandler(self.filehandler)
+ self.logger = logger
+ self.msg = ""
+
+ def evaluate_core(self, batch):
+ self.msg = "Evaluate: "
+ losses_dict = {}
+ # spk_id!=None in multiple spk diffsinger
+ spk_id = batch["spk_id"] if "spk_id" in batch else None
+ spk_emb = batch["spk_emb"] if "spk_emb" in batch else None
+ if spk_emb is not None:
+ spk_id = None
+
+ # Here show fastspeech2 eval
+ before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits = self.model(
+ text=batch["text"],
+ note=batch["note"],
+ note_dur=batch["note_dur"],
+ is_slur=batch["is_slur"],
+ text_lengths=batch["text_lengths"],
+ speech=batch["speech"],
+ speech_lengths=batch["speech_lengths"],
+ durations=batch["durations"],
+ pitch=batch["pitch"],
+ energy=batch["energy"],
+ spk_id=spk_id,
+ spk_emb=spk_emb,
+ only_train_fs2=True, )
+
+ l1_loss_fs2, ssim_loss_fs2, duration_loss, pitch_loss, energy_loss, speaker_loss = self.criterion_fs2(
+ after_outs=after_outs,
+ before_outs=before_outs,
+ d_outs=d_outs,
+ p_outs=p_outs,
+ e_outs=e_outs,
+ ys=ys,
+ ds=batch["durations"],
+ ps=batch["pitch"],
+ es=batch["energy"],
+ ilens=batch["text_lengths"],
+ olens=olens,
+ spk_logits=spk_logits,
+ spk_ids=spk_id, )
+
+ loss_fs2 = l1_loss_fs2 + ssim_loss_fs2 + duration_loss + pitch_loss + energy_loss + speaker_loss
+
+ report("eval/loss_fs2", float(loss_fs2))
+ report("eval/l1_loss_fs2", float(l1_loss_fs2))
+ report("eval/ssim_loss_fs2", float(ssim_loss_fs2))
+ report("eval/duration_loss", float(duration_loss))
+ report("eval/pitch_loss", float(pitch_loss))
+
+ losses_dict["l1_loss_fs2"] = float(l1_loss_fs2)
+ losses_dict["ssim_loss_fs2"] = float(ssim_loss_fs2)
+ losses_dict["duration_loss"] = float(duration_loss)
+ losses_dict["pitch_loss"] = float(pitch_loss)
+
+ if speaker_loss != 0.:
+ report("eval/speaker_loss", float(speaker_loss))
+ losses_dict["speaker_loss"] = float(speaker_loss)
+ if energy_loss != 0.:
+ report("eval/energy_loss", float(energy_loss))
+ losses_dict["energy_loss"] = float(energy_loss)
+
+ losses_dict["loss_fs2"] = float(loss_fs2)
+
+ # Here show diffusion eval
+ noise_pred, noise_target, mel_masks = self.model(
+ text=batch["text"],
+ note=batch["note"],
+ note_dur=batch["note_dur"],
+ is_slur=batch["is_slur"],
+ text_lengths=batch["text_lengths"],
+ speech=batch["speech"],
+ speech_lengths=batch["speech_lengths"],
+ durations=batch["durations"],
+ pitch=batch["pitch"],
+ energy=batch["energy"],
+ spk_id=spk_id,
+ spk_emb=spk_emb,
+ only_train_fs2=False, )
+
+ noise_pred = noise_pred.transpose((0, 2, 1))
+ noise_target = noise_target.transpose((0, 2, 1))
+ mel_masks = mel_masks.transpose((0, 2, 1))
+ l1_loss_ds = self.criterion_ds(
+ noise_pred=noise_pred,
+ noise_target=noise_target,
+ mel_masks=mel_masks, )
+
+ loss_ds = l1_loss_ds
+
+ report("eval/loss_ds", float(loss_ds))
+ report("eval/l1_loss_ds", float(l1_loss_ds))
+ losses_dict["l1_loss_ds"] = float(l1_loss_ds)
+ losses_dict["loss_ds"] = float(loss_ds)
+ self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
+ for k, v in losses_dict.items())
+
+ self.logger.info(self.msg)
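`DiffSingerUpdater` above implements a two-stage schedule: for the first `ds_train_start_steps` iterations only the FastSpeech2 branch is optimized, after that the diffusion branch is optimized while FastSpeech2 is optionally frozen. A schematic sketch of that gating, with a string return standing in for the real optimizer updates:

```python
def train_step(iteration: int,
               ds_train_start_steps: int = 160000,
               only_train_diffusion: bool = True) -> str:
    # stage 1: only the FastSpeech2 module is updated
    if iteration < ds_train_start_steps:
        return "update fs2"
    # stage 2: the diffusion module is updated; fs2 is frozen if requested
    if iteration > ds_train_start_steps:
        fs2_trainable = not only_train_diffusion
        return f"update diffusion (fs2 trainable={fs2_trainable})"
    return "boundary step: neither branch is updated"

print(train_step(0))         # update fs2
print(train_step(200000))    # update diffusion (fs2 trainable=False)
```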
diff --git a/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py b/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py
new file mode 100644
index 00000000..cce88d8a
--- /dev/null
+++ b/paddlespeech/t2s/models/diffsinger/fastspeech2midi.py
@@ -0,0 +1,654 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Modified from espnet(https://github.com/espnet/espnet)
+from typing import Any
+from typing import Dict
+from typing import Sequence
+from typing import Tuple
+
+import paddle
+from paddle import nn
+from typeguard import check_argument_types
+
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2
+from paddlespeech.t2s.models.fastspeech2 import FastSpeech2Loss
+from paddlespeech.t2s.modules.losses import ssim
+from paddlespeech.t2s.modules.masked_fill import masked_fill
+from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
+from paddlespeech.t2s.modules.nets_utils import make_pad_mask
+
+
+class FastSpeech2MIDI(FastSpeech2):
+ """The Fastspeech2 module of DiffSinger.
+ """
+
+ def __init__(
+ self,
+ # fastspeech2 network structure related
+ idim: int,
+ odim: int,
+ fastspeech2_params: Dict[str, Any],
+ # note emb
+ note_num: int=300,
+ # is_slur emb
+ is_slur_num: int=2,
+ use_energy_pred: bool=False,
+ use_postnet: bool=False, ):
+ """Initialize FastSpeech2 module for svs.
+ Args:
+ fastspeech2_params (Dict):
+ The config of FastSpeech2 module on DiffSinger model
+ note_num (Optional[int]):
+                Number of notes. If not None, assume that the
+ note_ids will be provided as the input and use note_embedding_table.
+ is_slur_num (Optional[int]):
+                Number of slur types. If not None, assume that the
+                is_slur_ids will be provided as the input.
+
+ """
+ assert check_argument_types()
+ super().__init__(idim=idim, odim=odim, **fastspeech2_params)
+ self.use_energy_pred = use_energy_pred
+ self.use_postnet = use_postnet
+ if not self.use_postnet:
+ self.postnet = None
+
+ self.note_embed_dim = self.is_slur_embed_dim = fastspeech2_params[
+ "adim"]
+
+ # note_ embed
+ self.note_embedding_table = nn.Embedding(
+ num_embeddings=note_num,
+ embedding_dim=self.note_embed_dim,
+ padding_idx=self.padding_idx)
+ self.note_dur_layer = nn.Linear(1, self.note_embed_dim)
+
+ # slur embed
+ self.is_slur_embedding_table = nn.Embedding(
+ num_embeddings=is_slur_num,
+ embedding_dim=self.is_slur_embed_dim,
+ padding_idx=self.padding_idx)
+
+ def forward(
+ self,
+ text: paddle.Tensor,
+ note: paddle.Tensor,
+ note_dur: paddle.Tensor,
+ is_slur: paddle.Tensor,
+ text_lengths: paddle.Tensor,
+ speech: paddle.Tensor,
+ speech_lengths: paddle.Tensor,
+ durations: paddle.Tensor,
+ pitch: paddle.Tensor,
+ energy: paddle.Tensor,
+ spk_emb: paddle.Tensor=None,
+ spk_id: paddle.Tensor=None,
+ ) -> Tuple[paddle.Tensor, Dict[str, paddle.Tensor], paddle.Tensor]:
+ """Calculate forward propagation.
+
+ Args:
+ text(Tensor(int64)):
+ Batch of padded token (phone) ids (B, Tmax).
+ note(Tensor(int64)):
+ Batch of padded note (element in music score) ids (B, Tmax).
+ note_dur(Tensor(float32)):
+ Batch of padded note durations in seconds (element in music score) (B, Tmax).
+ is_slur(Tensor(int64)):
+ Batch of padded slur (element in music score) ids (B, Tmax).
+ text_lengths(Tensor(int64)):
+ Batch of phone lengths of each input (B,).
+ speech(Tensor[float32]):
+ Batch of padded target features (e.g. mel) (B, Lmax, odim).
+ speech_lengths(Tensor(int64)):
+ Batch of the lengths of each target features (B,).
+ durations(Tensor(int64)):
+ Batch of padded token durations in frame (B, Tmax).
+ pitch(Tensor[float32]):
+ Batch of padded frame-averaged pitch (B, Lmax, 1).
+ energy(Tensor[float32]):
+ Batch of padded frame-averaged energy (B, Lmax, 1).
+ spk_emb(Tensor[float32], optional):
+ Batch of speaker embeddings (B, spk_embed_dim).
+            spk_id(Tensor(int64), optional):
+ Batch of speaker ids (B,)
+
+ Returns:
+
+ """
+ xs = paddle.cast(text, 'int64')
+ note = paddle.cast(note, 'int64')
+ note_dur = paddle.cast(note_dur, 'float32')
+ is_slur = paddle.cast(is_slur, 'int64')
+ ilens = paddle.cast(text_lengths, 'int64')
+ olens = paddle.cast(speech_lengths, 'int64')
+ ds = paddle.cast(durations, 'int64')
+ ps = pitch
+ es = energy
+ ys = speech
+ olens = speech_lengths
+ if spk_id is not None:
+ spk_id = paddle.cast(spk_id, 'int64')
+ # forward propagation
+ before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits = self._forward(
+ xs=xs,
+ note=note,
+ note_dur=note_dur,
+ is_slur=is_slur,
+ ilens=ilens,
+ olens=olens,
+ ds=ds,
+ ps=ps,
+ es=es,
+ is_inference=False,
+ spk_emb=spk_emb,
+ spk_id=spk_id, )
+ # modify mod part of groundtruth
+ if self.reduction_factor > 1:
+ olens = olens - olens % self.reduction_factor
+ max_olen = max(olens)
+ ys = ys[:, :max_olen]
+
+ return before_outs, after_outs, d_outs, p_outs, e_outs, ys, olens, spk_logits
+
+ def _forward(
+ self,
+ xs: paddle.Tensor,
+ note: paddle.Tensor,
+ note_dur: paddle.Tensor,
+ is_slur: paddle.Tensor,
+ ilens: paddle.Tensor,
+ olens: paddle.Tensor=None,
+ ds: paddle.Tensor=None,
+ ps: paddle.Tensor=None,
+ es: paddle.Tensor=None,
+ is_inference: bool=False,
+ is_train_diffusion: bool=False,
+ return_after_enc=False,
+ alpha: float=1.0,
+ spk_emb=None,
+ spk_id=None, ) -> Sequence[paddle.Tensor]:
+
+ before_outs = after_outs = d_outs = p_outs = e_outs = spk_logits = None
+ # forward encoder
+ masks = self._source_mask(ilens)
+ note_emb = self.note_embedding_table(note)
+ note_dur_emb = self.note_dur_layer(paddle.unsqueeze(note_dur, axis=-1))
+ is_slur_emb = self.is_slur_embedding_table(is_slur)
+
+ # (B, Tmax, adim)
+ hs, _ = self.encoder(
+ xs=xs,
+ masks=masks,
+ note_emb=note_emb,
+ note_dur_emb=note_dur_emb,
+ is_slur_emb=is_slur_emb, )
+
+ if self.spk_num and self.enable_speaker_classifier and not is_inference:
+ hs_for_spk_cls = self.grad_reverse(hs)
+ spk_logits = self.speaker_classifier(hs_for_spk_cls, ilens)
+ else:
+ spk_logits = None
+
+ # integrate speaker embedding
+ if self.spk_embed_dim is not None:
+ # spk_emb has a higher priority than spk_id
+ if spk_emb is not None:
+ hs = self._integrate_with_spk_embed(hs, spk_emb)
+ elif spk_id is not None:
+ spk_emb = self.spk_embedding_table(spk_id)
+ hs = self._integrate_with_spk_embed(hs, spk_emb)
+
+ # forward duration predictor (phone-level) and variance predictors (frame-level)
+ d_masks = make_pad_mask(ilens)
+ if olens is not None:
+ pitch_masks = make_pad_mask(olens).unsqueeze(-1)
+ else:
+ pitch_masks = None
+
+ # inference for decoder input for diffusion
+ if is_train_diffusion:
+ hs = self.length_regulator(hs, ds, is_inference=False)
+ p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
+ p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
+ (0, 2, 1))
+ hs += p_embs
+ if self.use_energy_pred:
+ e_outs = self.energy_predictor(hs.detach(), pitch_masks)
+ e_embs = self.energy_embed(
+ e_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
+ hs += e_embs
+
+ elif is_inference:
+ # (B, Tmax)
+ if ds is not None:
+ d_outs = ds
+ else:
+ d_outs = self.duration_predictor.inference(hs, d_masks)
+
+ # (B, Lmax, adim)
+ hs = self.length_regulator(hs, d_outs, alpha, is_inference=True)
+
+ if ps is not None:
+ p_outs = ps
+ else:
+ if self.stop_gradient_from_pitch_predictor:
+ p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
+ else:
+ p_outs = self.pitch_predictor(hs, pitch_masks)
+ p_embs = self.pitch_embed(p_outs.transpose((0, 2, 1))).transpose(
+ (0, 2, 1))
+ hs += p_embs
+
+ if self.use_energy_pred:
+ if es is not None:
+ e_outs = es
+ else:
+ if self.stop_gradient_from_energy_predictor:
+ e_outs = self.energy_predictor(hs.detach(), pitch_masks)
+ else:
+ e_outs = self.energy_predictor(hs, pitch_masks)
+ e_embs = self.energy_embed(
+ e_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
+ hs += e_embs
+
+ # training
+ else:
+ d_outs = self.duration_predictor(hs, d_masks)
+ # (B, Lmax, adim)
+ hs = self.length_regulator(hs, ds, is_inference=False)
+ if self.stop_gradient_from_pitch_predictor:
+ p_outs = self.pitch_predictor(hs.detach(), pitch_masks)
+ else:
+ p_outs = self.pitch_predictor(hs, pitch_masks)
+ p_embs = self.pitch_embed(ps.transpose((0, 2, 1))).transpose(
+ (0, 2, 1))
+ hs += p_embs
+
+ if self.use_energy_pred:
+ if self.stop_gradient_from_energy_predictor:
+ e_outs = self.energy_predictor(hs.detach(), pitch_masks)
+ else:
+ e_outs = self.energy_predictor(hs, pitch_masks)
+ e_embs = self.energy_embed(es.transpose((0, 2, 1))).transpose(
+ (0, 2, 1))
+ hs += e_embs
+
+ # forward decoder
+ if olens is not None and not is_inference:
+ if self.reduction_factor > 1:
+ olens_in = paddle.to_tensor(
+ [olen // self.reduction_factor for olen in olens.numpy()])
+ else:
+ olens_in = olens
+ # (B, 1, T)
+ h_masks = self._source_mask(olens_in)
+ else:
+ h_masks = None
+
+ if return_after_enc:
+ return hs, h_masks
+
+ if self.decoder_type == 'cnndecoder':
+ # remove output masks for dygraph to static graph
+ zs = self.decoder(hs, h_masks)
+ before_outs = zs
+ else:
+ # (B, Lmax, adim)
+ zs, _ = self.decoder(hs, h_masks)
+ # (B, Lmax, odim)
+ before_outs = self.feat_out(zs).reshape(
+ (paddle.shape(zs)[0], -1, self.odim))
+
+ # postnet -> (B, Lmax//r * r, odim)
+ if self.postnet is None:
+ after_outs = before_outs
+ else:
+ after_outs = before_outs + self.postnet(
+ before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
+
+ return before_outs, after_outs, d_outs, p_outs, e_outs, spk_logits
+
+ def encoder_infer(
+ self,
+ text: paddle.Tensor,
+ note: paddle.Tensor,
+ note_dur: paddle.Tensor,
+ is_slur: paddle.Tensor,
+ alpha: float=1.0,
+ spk_emb=None,
+ spk_id=None,
+ ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+ xs = paddle.cast(text, 'int64').unsqueeze(0)
+ note = paddle.cast(note, 'int64').unsqueeze(0)
+ note_dur = paddle.cast(note_dur, 'float32').unsqueeze(0)
+ is_slur = paddle.cast(is_slur, 'int64').unsqueeze(0)
+ # setup batch axis
+ ilens = paddle.shape(xs)[1]
+
+ if spk_emb is not None:
+ spk_emb = spk_emb.unsqueeze(0)
+
+ # (1, L, odim)
+ # use *_ to avoid bug in dygraph to static graph
+ hs, _ = self._forward(
+ xs=xs,
+ note=note,
+ note_dur=note_dur,
+ is_slur=is_slur,
+ ilens=ilens,
+ is_inference=True,
+ return_after_enc=True,
+ alpha=alpha,
+ spk_emb=spk_emb,
+ spk_id=spk_id, )
+ return hs
+
+ # get encoder output for diffusion training
+ def encoder_infer_batch(
+ self,
+ text: paddle.Tensor,
+ note: paddle.Tensor,
+ note_dur: paddle.Tensor,
+ is_slur: paddle.Tensor,
+ text_lengths: paddle.Tensor,
+ speech_lengths: paddle.Tensor,
+ ds: paddle.Tensor=None,
+ ps: paddle.Tensor=None,
+ es: paddle.Tensor=None,
+ alpha: float=1.0,
+ spk_emb=None,
+ spk_id=None, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+
+ xs = paddle.cast(text, 'int64')
+ note = paddle.cast(note, 'int64')
+ note_dur = paddle.cast(note_dur, 'float32')
+ is_slur = paddle.cast(is_slur, 'int64')
+ ilens = paddle.cast(text_lengths, 'int64')
+ olens = paddle.cast(speech_lengths, 'int64')
+
+ if spk_emb is not None:
+ spk_emb = spk_emb.unsqueeze(0)
+
+ # (1, L, odim)
+ # use *_ to avoid bug in dygraph to static graph
+ hs, h_masks = self._forward(
+ xs=xs,
+ note=note,
+ note_dur=note_dur,
+ is_slur=is_slur,
+ ilens=ilens,
+ olens=olens,
+ ds=ds,
+ ps=ps,
+ es=es,
+ return_after_enc=True,
+ is_train_diffusion=True,
+ alpha=alpha,
+ spk_emb=spk_emb,
+ spk_id=spk_id, )
+ return hs, h_masks
+
+ def inference(
+ self,
+ text: paddle.Tensor,
+ note: paddle.Tensor,
+ note_dur: paddle.Tensor,
+ is_slur: paddle.Tensor,
+ durations: paddle.Tensor=None,
+ pitch: paddle.Tensor=None,
+ energy: paddle.Tensor=None,
+ alpha: float=1.0,
+ use_teacher_forcing: bool=False,
+ spk_emb=None,
+ spk_id=None,
+ ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+ """Generate the sequence of features given the sequences of characters.
+
+ Args:
+ text(Tensor(int64)):
+ Input sequence of characters (T,).
+ note(Tensor(int64)):
+ Input note (element in music score) ids (T,).
+ note_dur(Tensor(float32)):
+ Input note durations in seconds (element in music score) (T,).
+ is_slur(Tensor(int64)):
+ Input slur (element in music score) ids (T,).
+ durations(Tensor, optional (int64)):
+ Groundtruth of duration (T,).
+ pitch(Tensor, optional):
+ Groundtruth of token-averaged pitch (T, 1).
+ energy(Tensor, optional):
+ Groundtruth of token-averaged energy (T, 1).
+ alpha(float, optional):
+ Alpha to control the speed.
+ use_teacher_forcing(bool, optional):
+ Whether to use teacher forcing.
+ If true, groundtruth of duration, pitch and energy will be used.
+            spk_emb(Tensor, optional):
+                Speaker embedding vector (spk_embed_dim,). (Default value = None)
+ spk_id(Tensor, optional(int64), optional):
+ spk ids (1,). (Default value = None)
+
+ Returns:
+
+ """
+ xs = paddle.cast(text, 'int64').unsqueeze(0)
+ note = paddle.cast(note, 'int64').unsqueeze(0)
+ note_dur = paddle.cast(note_dur, 'float32').unsqueeze(0)
+ is_slur = paddle.cast(is_slur, 'int64').unsqueeze(0)
+ d, p, e = durations, pitch, energy
+ # setup batch axis
+ ilens = paddle.shape(xs)[1]
+
+ if spk_emb is not None:
+ spk_emb = spk_emb.unsqueeze(0)
+
+ if use_teacher_forcing:
+ # use groundtruth of duration, pitch, and energy
+ ds = d.unsqueeze(0) if d is not None else None
+ ps = p.unsqueeze(0) if p is not None else None
+ es = e.unsqueeze(0) if e is not None else None
+
+ # (1, L, odim)
+ _, outs, d_outs, p_outs, e_outs, _ = self._forward(
+ xs=xs,
+ note=note,
+ note_dur=note_dur,
+ is_slur=is_slur,
+ ilens=ilens,
+ ds=ds,
+ ps=ps,
+ es=es,
+ spk_emb=spk_emb,
+ spk_id=spk_id,
+ is_inference=True)
+ else:
+ # (1, L, odim)
+ _, outs, d_outs, p_outs, e_outs, _ = self._forward(
+ xs=xs,
+ note=note,
+ note_dur=note_dur,
+ is_slur=is_slur,
+ ilens=ilens,
+ is_inference=True,
+ alpha=alpha,
+ spk_emb=spk_emb,
+ spk_id=spk_id, )
+
+ if e_outs is None:
+ e_outs = [None]
+
+ return outs[0], d_outs[0], p_outs[0], e_outs[0]
+
+
+class FastSpeech2MIDILoss(FastSpeech2Loss):
+ """Loss function module for DiffSinger."""
+
+ def __init__(self, use_masking: bool=True,
+ use_weighted_masking: bool=False):
+ """Initialize feed-forward Transformer loss module.
+ Args:
+ use_masking (bool):
+ Whether to apply masking for padded part in loss calculation.
+ use_weighted_masking (bool):
+ Whether to weighted masking in loss calculation.
+ """
+ assert check_argument_types()
+ super().__init__(use_masking, use_weighted_masking)
+
+ def forward(
+ self,
+ after_outs: paddle.Tensor,
+ before_outs: paddle.Tensor,
+ d_outs: paddle.Tensor,
+ p_outs: paddle.Tensor,
+ e_outs: paddle.Tensor,
+ ys: paddle.Tensor,
+ ds: paddle.Tensor,
+ ps: paddle.Tensor,
+ es: paddle.Tensor,
+ ilens: paddle.Tensor,
+ olens: paddle.Tensor,
+ spk_logits: paddle.Tensor=None,
+ spk_ids: paddle.Tensor=None,
+ ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor,
+ paddle.Tensor, ]:
+ """Calculate forward propagation.
+
+ Args:
+ after_outs(Tensor):
+ Batch of outputs after postnets (B, Lmax, odim).
+ before_outs(Tensor):
+ Batch of outputs before postnets (B, Lmax, odim).
+ d_outs(Tensor):
+ Batch of outputs of duration predictor (B, Tmax).
+ p_outs(Tensor):
+ Batch of outputs of pitch predictor (B, Lmax, 1).
+ e_outs(Tensor):
+ Batch of outputs of energy predictor (B, Lmax, 1).
+ ys(Tensor):
+ Batch of target features (B, Lmax, odim).
+ ds(Tensor):
+ Batch of durations (B, Tmax).
+ ps(Tensor):
+ Batch of target frame-averaged pitch (B, Lmax, 1).
+ es(Tensor):
+ Batch of target frame-averaged energy (B, Lmax, 1).
+ ilens(Tensor):
+ Batch of the lengths of each input (B,).
+ olens(Tensor):
+ Batch of the lengths of each target (B,).
+            spk_logits(Optional[Tensor]):
+ Batch of outputs after speaker classifier (B, Lmax, num_spk)
+            spk_ids(Optional[Tensor]):
+ Batch of target spk_id (B,)
+
+
+ Returns:
+
+
+ """
+ l1_loss = duration_loss = pitch_loss = energy_loss = speaker_loss = ssim_loss = 0.0
+
+ # apply mask to remove padded part
+ if self.use_masking:
+ # make feature for ssim loss
+ out_pad_masks = make_pad_mask(olens).unsqueeze(-1)
+ before_outs_ssim = masked_fill(before_outs, out_pad_masks, 0.0)
+ if not paddle.equal_all(after_outs, before_outs):
+ after_outs_ssim = masked_fill(after_outs, out_pad_masks, 0.0)
+ ys_ssim = masked_fill(ys, out_pad_masks, 0.0)
+
+ out_masks = make_non_pad_mask(olens).unsqueeze(-1)
+ before_outs = before_outs.masked_select(
+ out_masks.broadcast_to(before_outs.shape))
+ if not paddle.equal_all(after_outs, before_outs):
+ after_outs = after_outs.masked_select(
+ out_masks.broadcast_to(after_outs.shape))
+ ys = ys.masked_select(out_masks.broadcast_to(ys.shape))
+ duration_masks = make_non_pad_mask(ilens)
+ d_outs = d_outs.masked_select(
+ duration_masks.broadcast_to(d_outs.shape))
+ ds = ds.masked_select(duration_masks.broadcast_to(ds.shape))
+ pitch_masks = out_masks
+ p_outs = p_outs.masked_select(
+ pitch_masks.broadcast_to(p_outs.shape))
+ ps = ps.masked_select(pitch_masks.broadcast_to(ps.shape))
+ if e_outs is not None:
+ e_outs = e_outs.masked_select(
+ pitch_masks.broadcast_to(e_outs.shape))
+ es = es.masked_select(pitch_masks.broadcast_to(es.shape))
+
+ if spk_logits is not None and spk_ids is not None:
+ batch_size = spk_ids.shape[0]
+ spk_ids = paddle.repeat_interleave(spk_ids, spk_logits.shape[1],
+ None)
+ spk_logits = paddle.reshape(spk_logits,
+ [-1, spk_logits.shape[-1]])
+ mask_index = spk_logits.abs().sum(axis=1) != 0
+ spk_ids = spk_ids[mask_index]
+ spk_logits = spk_logits[mask_index]
+
+ # calculate loss
+ l1_loss = self.l1_criterion(before_outs, ys)
+ ssim_loss = 1.0 - ssim(
+ before_outs_ssim.unsqueeze(1), ys_ssim.unsqueeze(1))
+ if not paddle.equal_all(after_outs, before_outs):
+ l1_loss += self.l1_criterion(after_outs, ys)
+ ssim_loss += (
+ 1.0 - ssim(after_outs_ssim.unsqueeze(1), ys_ssim.unsqueeze(1)))
+ l1_loss = l1_loss * 0.5
+ ssim_loss = ssim_loss * 0.5
+
+ duration_loss = self.duration_criterion(d_outs, ds)
+ pitch_loss = self.l1_criterion(p_outs, ps)
+ if e_outs is not None:
+ energy_loss = self.l1_criterion(e_outs, es)
+
+ if spk_logits is not None and spk_ids is not None:
+ speaker_loss = self.ce_criterion(spk_logits, spk_ids) / batch_size
+
+ # make weighted mask and apply it
+ if self.use_weighted_masking:
+ out_masks = make_non_pad_mask(olens).unsqueeze(-1)
+ out_weights = out_masks.cast(dtype=paddle.float32) / out_masks.cast(
+ dtype=paddle.float32).sum(
+ axis=1, keepdim=True)
+ out_weights /= ys.shape[0] * ys.shape[2]
+ duration_masks = make_non_pad_mask(ilens)
+ duration_weights = (duration_masks.cast(dtype=paddle.float32) /
+ duration_masks.cast(dtype=paddle.float32).sum(
+ axis=1, keepdim=True))
+ duration_weights /= ds.shape[0]
+
+ # apply weight
+ l1_loss = l1_loss.multiply(out_weights)
+ l1_loss = l1_loss.masked_select(
+ out_masks.broadcast_to(l1_loss.shape)).sum()
+ ssim_loss = ssim_loss.multiply(out_weights)
+ ssim_loss = ssim_loss.masked_select(
+ out_masks.broadcast_to(ssim_loss.shape)).sum()
+ duration_loss = (duration_loss.multiply(duration_weights)
+ .masked_select(duration_masks).sum())
+ pitch_masks = out_masks
+ pitch_weights = out_weights
+ pitch_loss = pitch_loss.multiply(pitch_weights)
+ pitch_loss = pitch_loss.masked_select(
+ pitch_masks.broadcast_to(pitch_loss.shape)).sum()
+ if e_outs is not None:
+ energy_loss = energy_loss.multiply(pitch_weights)
+ energy_loss = energy_loss.masked_select(
+ pitch_masks.broadcast_to(energy_loss.shape)).sum()
+
+ return l1_loss, ssim_loss, duration_loss, pitch_loss, energy_loss, speaker_loss
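`FastSpeech2MIDILoss` above repeatedly builds boolean masks from the length tensors (`make_non_pad_mask`, `make_pad_mask`) and drops padded positions with `masked_select`. A minimal, self-contained sketch of that mask construction written directly in paddle; it does not re-implement the patch's helpers exactly:

```python
import paddle

lengths = paddle.to_tensor([3, 5])        # olens / ilens style length tensor
max_len = int(lengths.max())
# non-pad mask: True inside each sequence, False on padding
non_pad = paddle.arange(max_len).unsqueeze(0) < lengths.unsqueeze(1)
print(non_pad.astype("int32"))
# [[1, 1, 1, 0, 0],
#  [1, 1, 1, 1, 1]]

x = paddle.randn([2, max_len])
valid = x.masked_select(non_pad)          # flattened valid entries, shape [8]
print(valid.shape)
```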
diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
index c790c8cb..a95a9b28 100644
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@@ -93,6 +93,7 @@ class FastSpeech2(nn.Layer):
transformer_dec_dropout_rate: float=0.1,
transformer_dec_positional_dropout_rate: float=0.1,
transformer_dec_attn_dropout_rate: float=0.1,
+ transformer_activation_type: str="relu",
# for conformer
conformer_pos_enc_layer_type: str="rel_pos",
conformer_self_attn_layer_type: str="rel_selfattn",
@@ -200,6 +201,8 @@ class FastSpeech2(nn.Layer):
Dropout rate after decoder positional encoding.
transformer_dec_attn_dropout_rate (float):
Dropout rate in decoder self-attention module.
+ transformer_activation_type (str):
+ Activation function type in transformer.
conformer_pos_enc_layer_type (str):
Pos encoding layer type in conformer.
conformer_self_attn_layer_type (str):
@@ -250,7 +253,7 @@ class FastSpeech2(nn.Layer):
Kernel size of energy embedding.
energy_embed_dropout_rate (float):
Dropout rate for energy embedding.
- stop_gradient_from_energy_predictor(bool):
+ stop_gradient_from_energy_predictor (bool):
Whether to stop gradient from energy predictor to encoder.
spk_num (Optional[int]):
Number of speakers. If not None, assume that the spk_embed_dim is not None,
@@ -269,7 +272,7 @@ class FastSpeech2(nn.Layer):
How to integrate tone embedding.
init_type (str):
How to initialize transformer parameters.
- init_enc_alpha (float):
+ init_enc_alpha (float):
Initial value of alpha in scaled pos encoding of the encoder.
init_dec_alpha (float):
Initial value of alpha in scaled pos encoding of the decoder.
@@ -344,7 +347,8 @@ class FastSpeech2(nn.Layer):
normalize_before=encoder_normalize_before,
concat_after=encoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
- positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
+ positionwise_conv_kernel_size=positionwise_conv_kernel_size,
+ activation_type=transformer_activation_type)
elif encoder_type == "conformer":
self.encoder = ConformerEncoder(
idim=idim,
@@ -453,7 +457,8 @@ class FastSpeech2(nn.Layer):
normalize_before=decoder_normalize_before,
concat_after=decoder_concat_after,
positionwise_layer_type=positionwise_layer_type,
- positionwise_conv_kernel_size=positionwise_conv_kernel_size, )
+ positionwise_conv_kernel_size=positionwise_conv_kernel_size,
+ activation_type=conformer_activation_type, )
elif decoder_type == "conformer":
self.decoder = ConformerEncoder(
idim=0,
@@ -778,7 +783,7 @@ class FastSpeech2(nn.Layer):
x = paddle.cast(text, 'int64')
d, p, e = durations, pitch, energy
# setup batch axis
- ilens = paddle.shape(x)[0]
+ ilens = paddle.shape(x)[0:1]
xs = x.unsqueeze(0)
diff --git a/paddlespeech/t2s/models/hifigan/hifigan.py b/paddlespeech/t2s/models/hifigan/hifigan.py
index 7a01840e..2759af9d 100644
--- a/paddlespeech/t2s/models/hifigan/hifigan.py
+++ b/paddlespeech/t2s/models/hifigan/hifigan.py
@@ -37,8 +37,8 @@ class HiFiGANGenerator(nn.Layer):
channels: int=512,
global_channels: int=-1,
kernel_size: int=7,
- upsample_scales: List[int]=(8, 8, 2, 2),
- upsample_kernel_sizes: List[int]=(16, 16, 4, 4),
+ upsample_scales: List[int]=(5, 5, 4, 3),
+ upsample_kernel_sizes: List[int]=(10, 10, 8, 6),
resblock_kernel_sizes: List[int]=(3, 7, 11),
resblock_dilations: List[List[int]]=[(1, 3, 5), (1, 3, 5),
(1, 3, 5)],
@@ -47,8 +47,13 @@ class HiFiGANGenerator(nn.Layer):
nonlinear_activation: str="leakyrelu",
nonlinear_activation_params: Dict[str, Any]={"negative_slope": 0.1},
use_weight_norm: bool=True,
- init_type: str="xavier_uniform", ):
+ init_type: str="xavier_uniform",
+ use_istft: bool=False,
+ istft_layer_id: int=2,
+ n_fft: int=2048,
+ win_length: int=1200, ):
"""Initialize HiFiGANGenerator module.
+
Args:
in_channels (int):
Number of input channels.
@@ -79,6 +84,14 @@ class HiFiGANGenerator(nn.Layer):
use_weight_norm (bool):
Whether to use weight norm.
If set to true, it will be applied to all of the conv layers.
+ use_istft (bool):
+            If set to true, the generator becomes an iSTFTNet variant of HiFiGAN.
+ istft_layer_id (int):
+            Apply iSTFT after istft_layer_id upsample layers when use_istft=True.
+ n_fft (int):
+ Number of fft points in feature extraction
+ win_length (int):
+ Window length in feature extraction
"""
super().__init__()
@@ -89,9 +102,11 @@ class HiFiGANGenerator(nn.Layer):
assert kernel_size % 2 == 1, "Kernel size must be odd number."
assert len(upsample_scales) == len(upsample_kernel_sizes)
assert len(resblock_dilations) == len(resblock_kernel_sizes)
+ assert len(upsample_scales) >= istft_layer_id if use_istft else True
# define modules
- self.num_upsamples = len(upsample_kernel_sizes)
+ self.num_upsamples = len(
+ upsample_kernel_sizes) if not use_istft else istft_layer_id
self.num_blocks = len(resblock_kernel_sizes)
self.input_conv = nn.Conv1D(
in_channels,
@@ -101,7 +116,7 @@ class HiFiGANGenerator(nn.Layer):
padding=(kernel_size - 1) // 2, )
self.upsamples = nn.LayerList()
self.blocks = nn.LayerList()
- for i in range(len(upsample_kernel_sizes)):
+ for i in range(self.num_upsamples):
assert upsample_kernel_sizes[i] == 2 * upsample_scales[i]
self.upsamples.append(
nn.Sequential(
@@ -126,15 +141,36 @@ class HiFiGANGenerator(nn.Layer):
nonlinear_activation=nonlinear_activation,
nonlinear_activation_params=nonlinear_activation_params,
))
- self.output_conv = nn.Sequential(
- nn.LeakyReLU(),
- nn.Conv1D(
+ self.use_istft = use_istft
+ if self.use_istft:
+ self.istft_hop_size = 1
+ for j in range(istft_layer_id, len(upsample_scales)):
+ self.istft_hop_size *= upsample_scales[j]
+ s = 1
+ for j in range(istft_layer_id):
+ s *= upsample_scales[j]
+ self.istft_n_fft = int(n_fft / s) if (
+ n_fft / s) % 2 == 0 else int((n_fft / s + 2) - n_fft / s % 2)
+ self.istft_win_length = int(win_length / s) if (
+ win_length /
+ s) % 2 == 0 else int((win_length / s + 2) - win_length / s % 2)
+ self.reflection_pad = nn.Pad1D(padding=[1, 0], mode='reflect')
+ self.output_conv = nn.Conv1D(
channels // (2**(i + 1)),
- out_channels,
+ (self.istft_n_fft // 2 + 1) * 2,
kernel_size,
1,
- padding=(kernel_size - 1) // 2, ),
- nn.Tanh(), )
+ padding=(kernel_size - 1) // 2, )
+ else:
+ self.output_conv = nn.Sequential(
+ nn.LeakyReLU(),
+ nn.Conv1D(
+ channels // (2**(i + 1)),
+ out_channels,
+ kernel_size,
+ 1,
+ padding=(kernel_size - 1) // 2, ),
+ nn.Tanh(), )
if global_channels > 0:
self.global_conv = nn.Conv1D(global_channels, channels, 1)
@@ -167,7 +203,29 @@ class HiFiGANGenerator(nn.Layer):
for j in range(self.num_blocks):
cs += self.blocks[i * self.num_blocks + j](c)
c = cs / self.num_blocks
- c = self.output_conv(c)
+
+ if self.use_istft:
+ c = F.leaky_relu(c)
+ c = self.reflection_pad(c)
+ c = self.output_conv(c)
+ """
+ Input of Exp operator, an N-D Tensor, with data type float32, float64 or float16.
+ https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/exp_en.html
+ Use Euler's formula to implement spec*paddle.exp(1j*phase)
+ """
+ spec = paddle.exp(c[:, :self.istft_n_fft // 2 + 1, :])
+ phase = paddle.sin(c[:, self.istft_n_fft // 2 + 1:, :])
+
+ c = paddle.complex(spec * (paddle.cos(phase)),
+ spec * (paddle.sin(phase)))
+ c = paddle.signal.istft(
+ c,
+ n_fft=self.istft_n_fft,
+ hop_length=self.istft_hop_size,
+ win_length=self.istft_win_length)
+ c = c.unsqueeze(1)
+ else:
+ c = self.output_conv(c)
return c
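The iSTFT branch above rebuilds a complex spectrum from predicted magnitude and phase (Euler's formula) and hands it to `paddle.signal.istft`. A minimal round trip through the `paddle.signal` API it relies on; the FFT parameters below are arbitrary example values:

```python
import paddle

x = paddle.randn([1, 16000])              # dummy 1-second waveform
n_fft, hop, win = 1024, 256, 1024
spec = paddle.signal.stft(x, n_fft=n_fft, hop_length=hop, win_length=win)

# Euler's formula: magnitude * exp(1j * phase), expressed as real/imag parts
mag, phase = spec.abs(), paddle.angle(spec)
rebuilt = paddle.complex(mag * paddle.cos(phase), mag * paddle.sin(phase))

y = paddle.signal.istft(rebuilt, n_fft=n_fft, hop_length=hop, win_length=win)
print(x.shape, spec.shape, y.shape)
```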
diff --git a/paddlespeech/t2s/models/jets/__init__.py b/paddlespeech/t2s/models/jets/__init__.py
new file mode 100644
index 00000000..dec4a331
--- /dev/null
+++ b/paddlespeech/t2s/models/jets/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .jets import *
+from .jets_updater import *
diff --git a/paddlespeech/t2s/models/jets/alignments.py b/paddlespeech/t2s/models/jets/alignments.py
new file mode 100644
index 00000000..998f67e2
--- /dev/null
+++ b/paddlespeech/t2s/models/jets/alignments.py
@@ -0,0 +1,182 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Generator module in JETS.
+
+This code is based on https://github.com/imdanboy/jets.
+
+"""
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from numba import jit
+from paddle import nn
+
+from paddlespeech.t2s.modules.masked_fill import masked_fill
+
+
+class AlignmentModule(nn.Layer):
+ """Alignment Learning Framework proposed for parallel TTS models in:
+ https://arxiv.org/abs/2108.10447
+ """
+
+ def __init__(self, adim, odim):
+ super().__init__()
+ self.t_conv1 = nn.Conv1D(adim, adim, kernel_size=3, padding=1)
+ self.t_conv2 = nn.Conv1D(adim, adim, kernel_size=1, padding=0)
+
+ self.f_conv1 = nn.Conv1D(odim, adim, kernel_size=3, padding=1)
+ self.f_conv2 = nn.Conv1D(adim, adim, kernel_size=3, padding=1)
+ self.f_conv3 = nn.Conv1D(adim, adim, kernel_size=1, padding=0)
+
+ def forward(self, text, feats, x_masks=None):
+ """
+ Args:
+ text (Tensor): Batched text embedding (B, T_text, adim)
+ feats (Tensor): Batched acoustic feature (B, T_feats, odim)
+ x_masks (Tensor): Mask tensor (B, T_text)
+
+        Returns:
+            Tensor: Log probability of the attention matrix (B, T_feats, T_text).
+            Tensor: Unnormalized attention score before the log-softmax (B, T_feats, T_text).
+ """
+
+ text = text.transpose((0, 2, 1))
+ text = F.relu(self.t_conv1(text))
+ text = self.t_conv2(text)
+ text = text.transpose((0, 2, 1))
+
+ feats = feats.transpose((0, 2, 1))
+ feats = F.relu(self.f_conv1(feats))
+ feats = F.relu(self.f_conv2(feats))
+ feats = self.f_conv3(feats)
+ feats = feats.transpose((0, 2, 1))
+
+ dist = feats.unsqueeze(2) - text.unsqueeze(1)
+ dist = paddle.linalg.norm(dist, p=2, axis=3)
+ score = -dist
+
+ if x_masks is not None:
+ x_masks = x_masks.unsqueeze(-2)
+ score = masked_fill(score, x_masks, -np.inf)
+ log_p_attn = F.log_softmax(score, axis=-1)
+ return log_p_attn, score
+
+
+@jit(nopython=True)
+def _monotonic_alignment_search(log_p_attn):
+ # https://arxiv.org/abs/2005.11129
+ T_mel = log_p_attn.shape[0]
+ T_inp = log_p_attn.shape[1]
+ Q = np.full((T_inp, T_mel), fill_value=-np.inf)
+
+ log_prob = log_p_attn.transpose(1, 0) # -> (T_inp,T_mel)
+ # 1. Q <- init first row for all j
+ for j in range(T_mel):
+ Q[0, j] = log_prob[0, :j + 1].sum()
+
+ # 2.
+ for j in range(1, T_mel):
+ for i in range(1, min(j + 1, T_inp)):
+ Q[i, j] = max(Q[i - 1, j - 1], Q[i, j - 1]) + log_prob[i, j]
+
+ # 3.
+ A = np.full((T_mel, ), fill_value=T_inp - 1)
+ for j in range(T_mel - 2, -1, -1): # T_mel-2, ..., 0
+ # 'i' in {A[j+1]-1, A[j+1]}
+ i_a = A[j + 1] - 1
+ i_b = A[j + 1]
+ if i_b == 0:
+ argmax_i = 0
+ elif Q[i_a, j] >= Q[i_b, j]:
+ argmax_i = i_a
+ else:
+ argmax_i = i_b
+ A[j] = argmax_i
+ return A
+
+
+def viterbi_decode(log_p_attn, text_lengths, feats_lengths):
+ """
+ Args:
+ log_p_attn (Tensor):
+ Batched log probability of attention matrix (B, T_feats, T_text)
+ text_lengths (Tensor):
+ Text length tensor (B,)
+        feats_lengths (Tensor):
+ Feature length tensor (B,)
+ Returns:
+ Tensor:
+ Batched token duration extracted from `log_p_attn` (B,T_text)
+ Tensor:
+            Binarization loss tensor (scalar).
+ """
+ B = log_p_attn.shape[0]
+ T_text = log_p_attn.shape[2]
+ device = log_p_attn.place
+
+ bin_loss = 0
+ ds = paddle.zeros((B, T_text), dtype="int32")
+ for b in range(B):
+ cur_log_p_attn = log_p_attn[b, :feats_lengths[b], :text_lengths[b]]
+ viterbi = _monotonic_alignment_search(cur_log_p_attn.numpy())
+ _ds = np.bincount(viterbi)
+ ds[b, :len(_ds)] = paddle.to_tensor(
+ _ds, place=device, dtype="int32")
+
+ t_idx = paddle.arange(feats_lengths[b])
+ bin_loss = bin_loss - cur_log_p_attn[t_idx, viterbi].mean()
+ bin_loss = bin_loss / B
+ return ds, bin_loss
+
+
+@jit(nopython=True)
+def _average_by_duration(ds, xs, text_lengths, feats_lengths):
+ B = ds.shape[0]
+ # xs_avg = np.zeros_like(ds)
+ xs_avg = np.zeros(shape=ds.shape, dtype=np.float32)
+ ds = ds.astype(np.int32)
+ for b in range(B):
+ t_text = text_lengths[b]
+ t_feats = feats_lengths[b]
+ d = ds[b, :t_text]
+ d_cumsum = d.cumsum()
+ d_cumsum = [0] + list(d_cumsum)
+ x = xs[b, :t_feats]
+ for n, (start, end) in enumerate(zip(d_cumsum[:-1], d_cumsum[1:])):
+ if len(x[start:end]) != 0:
+ xs_avg[b, n] = x[start:end].mean()
+ else:
+ xs_avg[b, n] = 0
+ return xs_avg
+
+
+def average_by_duration(ds, xs, text_lengths, feats_lengths):
+ """
+ Args:
+ ds (Tensor):
+ Batched token duration (B,T_text)
+ xs (Tensor):
+ Batched feature sequences to be averaged (B,T_feats)
+ text_lengths (Tensor):
+ Text length tensor (B,)
+ feats_lengths (Tensor):
+ Feature length tensor (B,)
+ Returns:
+ Tensor: Batched feature averaged according to the token duration (B, T_text)
+ """
+ device = ds.place
+ args = [ds, xs, text_lengths, feats_lengths]
+ args = [arg.numpy() for arg in args]
+ xs_avg = _average_by_duration(*args)
+ xs_avg = paddle.to_tensor(xs_avg, place=device)
+ return xs_avg
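
For reference, here is a small NumPy sketch of the monotonic alignment search used by `viterbi_decode` above (the same dynamic program as `_monotonic_alignment_search`, just without the numba jit); the toy attention matrix is an illustrative assumption.

```python
import numpy as np

def mas(log_p_attn):
    """Return, per mel frame, the index of the aligned text token (monotonic)."""
    T_mel, T_text = log_p_attn.shape
    log_prob = log_p_attn.T                    # (T_text, T_mel)
    Q = np.full((T_text, T_mel), -np.inf)
    for j in range(T_mel):                     # the first token may absorb leading frames
        Q[0, j] = log_prob[0, :j + 1].sum()
    for j in range(1, T_mel):                  # stay on the same token or advance by one
        for i in range(1, min(j + 1, T_text)):
            Q[i, j] = max(Q[i - 1, j - 1], Q[i, j - 1]) + log_prob[i, j]
    A = np.full((T_mel,), T_text - 1)
    for j in range(T_mel - 2, -1, -1):         # backtrack the best monotonic path
        i_a, i_b = A[j + 1] - 1, A[j + 1]
        if i_b == 0:
            A[j] = 0
        elif Q[i_a, j] >= Q[i_b, j]:
            A[j] = i_a
        else:
            A[j] = i_b
    return A

log_p_attn = np.log([[0.9, 0.1], [0.6, 0.4], [0.2, 0.8], [0.1, 0.9]])  # (T_mel=4, T_text=2)
alignment = mas(log_p_attn)                      # -> [0, 0, 1, 1]
durations = np.bincount(alignment, minlength=2)  # frames per token -> [2, 2]
print(alignment, durations)
```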
diff --git a/paddlespeech/t2s/models/jets/generator.py b/paddlespeech/t2s/models/jets/generator.py
new file mode 100644
index 00000000..9580d17d
--- /dev/null
+++ b/paddlespeech/t2s/models/jets/generator.py
@@ -0,0 +1,897 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Generator module in JETS.
+
+This code is based on https://github.com/imdanboy/jets.
+
+"""
+import logging
+import math
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+from typeguard import check_argument_types
+
+from paddlespeech.t2s.models.hifigan import HiFiGANGenerator
+from paddlespeech.t2s.models.jets.alignments import AlignmentModule
+from paddlespeech.t2s.models.jets.alignments import average_by_duration
+from paddlespeech.t2s.models.jets.alignments import viterbi_decode
+from paddlespeech.t2s.models.jets.length_regulator import GaussianUpsampling
+from paddlespeech.t2s.modules.nets_utils import get_random_segments
+from paddlespeech.t2s.modules.nets_utils import initialize
+from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
+from paddlespeech.t2s.modules.nets_utils import make_pad_mask
+from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredictor
+from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator
+from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor
+from paddlespeech.t2s.modules.style_encoder import StyleEncoder
+from paddlespeech.t2s.modules.transformer.embedding import PositionalEncoding
+from paddlespeech.t2s.modules.transformer.embedding import ScaledPositionalEncoding
+from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder
+from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder
+
+
+class JETSGenerator(nn.Layer):
+ """Generator module in JETS.
+ """
+
+ def __init__(
+ self,
+ idim: int,
+ odim: int,
+ adim: int=256,
+ aheads: int=2,
+ elayers: int=4,
+ eunits: int=1024,
+ dlayers: int=4,
+ dunits: int=1024,
+ positionwise_layer_type: str="conv1d",
+ positionwise_conv_kernel_size: int=1,
+ use_scaled_pos_enc: bool=True,
+ use_batch_norm: bool=True,
+ encoder_normalize_before: bool=True,
+ decoder_normalize_before: bool=True,
+ encoder_concat_after: bool=False,
+ decoder_concat_after: bool=False,
+ reduction_factor: int=1,
+ encoder_type: str="transformer",
+ decoder_type: str="transformer",
+ transformer_enc_dropout_rate: float=0.1,
+ transformer_enc_positional_dropout_rate: float=0.1,
+ transformer_enc_attn_dropout_rate: float=0.1,
+ transformer_dec_dropout_rate: float=0.1,
+ transformer_dec_positional_dropout_rate: float=0.1,
+ transformer_dec_attn_dropout_rate: float=0.1,
+ transformer_activation_type: str="relu",
+ # only for conformer
+ conformer_rel_pos_type: str="legacy",
+ conformer_pos_enc_layer_type: str="rel_pos",
+ conformer_self_attn_layer_type: str="rel_selfattn",
+ conformer_activation_type: str="swish",
+ use_macaron_style_in_conformer: bool=True,
+ use_cnn_in_conformer: bool=True,
+ zero_triu: bool=False,
+ conformer_enc_kernel_size: int=7,
+ conformer_dec_kernel_size: int=31,
+ # duration predictor
+ duration_predictor_layers: int=2,
+ duration_predictor_chans: int=384,
+ duration_predictor_kernel_size: int=3,
+ duration_predictor_dropout_rate: float=0.1,
+ # energy predictor
+ energy_predictor_layers: int=2,
+ energy_predictor_chans: int=384,
+ energy_predictor_kernel_size: int=3,
+ energy_predictor_dropout: float=0.5,
+ energy_embed_kernel_size: int=9,
+ energy_embed_dropout: float=0.5,
+ stop_gradient_from_energy_predictor: bool=False,
+ # pitch predictor
+ pitch_predictor_layers: int=2,
+ pitch_predictor_chans: int=384,
+ pitch_predictor_kernel_size: int=3,
+ pitch_predictor_dropout: float=0.5,
+ pitch_embed_kernel_size: int=9,
+ pitch_embed_dropout: float=0.5,
+ stop_gradient_from_pitch_predictor: bool=False,
+ # extra embedding related
+ spks: Optional[int]=None,
+ langs: Optional[int]=None,
+ spk_embed_dim: Optional[int]=None,
+ spk_embed_integration_type: str="add",
+ use_gst: bool=False,
+ gst_tokens: int=10,
+ gst_heads: int=4,
+ gst_conv_layers: int=6,
+ gst_conv_chans_list: Sequence[int]=(32, 32, 64, 64, 128, 128),
+ gst_conv_kernel_size: int=3,
+ gst_conv_stride: int=2,
+ gst_gru_layers: int=1,
+ gst_gru_units: int=128,
+ # training related
+ init_type: str="xavier_uniform",
+ init_enc_alpha: float=1.0,
+ init_dec_alpha: float=1.0,
+ use_masking: bool=False,
+ use_weighted_masking: bool=False,
+ segment_size: int=64,
+ # hifigan generator
+ generator_out_channels: int=1,
+ generator_channels: int=512,
+ generator_global_channels: int=-1,
+ generator_kernel_size: int=7,
+ generator_upsample_scales: List[int]=[8, 8, 2, 2],
+ generator_upsample_kernel_sizes: List[int]=[16, 16, 4, 4],
+ generator_resblock_kernel_sizes: List[int]=[3, 7, 11],
+ generator_resblock_dilations: List[List[int]]=[[1, 3, 5], [1, 3, 5],
+ [1, 3, 5]],
+ generator_use_additional_convs: bool=True,
+ generator_bias: bool=True,
+ generator_nonlinear_activation: str="LeakyReLU",
+ generator_nonlinear_activation_params: Dict[
+ str, Any]={"negative_slope": 0.1},
+ generator_use_weight_norm: bool=True, ):
+ """Initialize JETS generator module.
+
+ Args:
+ idim (int):
+ Dimension of the inputs.
+ odim (int):
+ Dimension of the outputs.
+ adim (int):
+ Attention dimension.
+ aheads (int):
+ Number of attention heads.
+ elayers (int):
+ Number of encoder layers.
+ eunits (int):
+ Number of encoder hidden units.
+ dlayers (int):
+ Number of decoder layers.
+ dunits (int):
+ Number of decoder hidden units.
+ use_scaled_pos_enc (bool):
+ Whether to use trainable scaled pos encoding.
+ use_batch_norm (bool):
+ Whether to use batch normalization in encoder prenet.
+ encoder_normalize_before (bool):
+ Whether to apply layernorm layer before encoder block.
+ decoder_normalize_before (bool):
+ Whether to apply layernorm layer before decoder block.
+ encoder_concat_after (bool):
+ Whether to concatenate attention layer's input and output in encoder.
+ decoder_concat_after (bool):
+ Whether to concatenate attention layer's input and output in decoder.
+ reduction_factor (int):
+ Reduction factor.
+ encoder_type (str):
+ Encoder type ("transformer" or "conformer").
+ decoder_type (str):
+ Decoder type ("transformer" or "conformer").
+ transformer_enc_dropout_rate (float):
+ Dropout rate in encoder except attention and positional encoding.
+ transformer_enc_positional_dropout_rate (float):
+ Dropout rate after encoder positional encoding.
+ transformer_enc_attn_dropout_rate (float):
+ Dropout rate in encoder self-attention module.
+ transformer_dec_dropout_rate (float):
+ Dropout rate in decoder except attention & positional encoding.
+ transformer_dec_positional_dropout_rate (float):
+ Dropout rate after decoder positional encoding.
+ transformer_dec_attn_dropout_rate (float):
+ Dropout rate in decoder self-attention module.
+ conformer_rel_pos_type (str):
+ Relative pos encoding type in conformer.
+ conformer_pos_enc_layer_type (str):
+ Pos encoding layer type in conformer.
+ conformer_self_attn_layer_type (str):
+ Self-attention layer type in conformer
+ conformer_activation_type (str):
+ Activation function type in conformer.
+ use_macaron_style_in_conformer:
+ Whether to use macaron style FFN.
+ use_cnn_in_conformer:
+ Whether to use CNN in conformer.
+ zero_triu:
+ Whether to use zero triu in relative self-attention module.
+ conformer_enc_kernel_size:
+ Kernel size of encoder conformer.
+ conformer_dec_kernel_size:
+ Kernel size of decoder conformer.
+ duration_predictor_layers (int):
+ Number of duration predictor layers.
+ duration_predictor_chans (int):
+ Number of duration predictor channels.
+ duration_predictor_kernel_size (int):
+ Kernel size of duration predictor.
+ duration_predictor_dropout_rate (float):
+ Dropout rate in duration predictor.
+ pitch_predictor_layers (int):
+ Number of pitch predictor layers.
+ pitch_predictor_chans (int):
+ Number of pitch predictor channels.
+ pitch_predictor_kernel_size (int):
+ Kernel size of pitch predictor.
+ pitch_predictor_dropout_rate (float):
+ Dropout rate in pitch predictor.
+ pitch_embed_kernel_size (float):
+ Kernel size of pitch embedding.
+ pitch_embed_dropout_rate (float):
+ Dropout rate for pitch embedding.
+ stop_gradient_from_pitch_predictor:
+ Whether to stop gradient from pitch predictor to encoder.
+ energy_predictor_layers (int):
+ Number of energy predictor layers.
+ energy_predictor_chans (int):
+ Number of energy predictor channels.
+ energy_predictor_kernel_size (int):
+ Kernel size of energy predictor.
+ energy_predictor_dropout_rate (float):
+ Dropout rate in energy predictor.
+ energy_embed_kernel_size (float):
+ Kernel size of energy embedding.
+ energy_embed_dropout_rate (float):
+ Dropout rate for energy embedding.
+ stop_gradient_from_energy_predictor:
+ Whether to stop gradient from energy predictor to encoder.
+ spks (Optional[int]):
+ Number of speakers. If set to > 1, assume that the sids will be provided as the input and use sid embedding layer.
+ langs (Optional[int]):
+                Number of languages. If set to > 1, assume that the lids will be provided as the input and use lid embedding layer.
+ spk_embed_dim (Optional[int]):
+ Speaker embedding dimension. If set to > 0, assume that spembs will be provided as the input.
+ spk_embed_integration_type:
+ How to integrate speaker embedding.
+ use_gst (str):
+ Whether to use global style token.
+ gst_tokens (int):
+ The number of GST embeddings.
+ gst_heads (int):
+ The number of heads in GST multihead attention.
+ gst_conv_layers (int):
+ The number of conv layers in GST.
+ gst_conv_chans_list: (Sequence[int]):
+ List of the number of channels of conv layers in GST.
+ gst_conv_kernel_size (int):
+ Kernel size of conv layers in GST.
+ gst_conv_stride (int):
+ Stride size of conv layers in GST.
+ gst_gru_layers (int):
+ The number of GRU layers in GST.
+ gst_gru_units (int):
+ The number of GRU units in GST.
+ init_type (str):
+ How to initialize transformer parameters.
+ init_enc_alpha (float):
+ Initial value of alpha in scaled pos encoding of the encoder.
+ init_dec_alpha (float):
+ Initial value of alpha in scaled pos encoding of the decoder.
+ use_masking (bool):
+ Whether to apply masking for padded part in loss calculation.
+ use_weighted_masking (bool):
+ Whether to apply weighted masking in loss calculation.
+ segment_size (int):
+ Segment size for random windowed discriminator
+ generator_out_channels (int):
+ Number of output channels.
+ generator_channels (int):
+ Number of hidden representation channels.
+ generator_global_channels (int):
+ Number of global conditioning channels.
+ generator_kernel_size (int):
+ Kernel size of initial and final conv layer.
+ generator_upsample_scales (List[int]):
+ List of upsampling scales.
+ generator_upsample_kernel_sizes (List[int]):
+ List of kernel sizes for upsample layers.
+ generator_resblock_kernel_sizes (List[int]):
+ List of kernel sizes for residual blocks.
+ generator_resblock_dilations (List[List[int]]):
+ List of list of dilations for residual blocks.
+ generator_use_additional_convs (bool):
+ Whether to use additional conv layers in residual blocks.
+ generator_bias (bool):
+ Whether to add bias parameter in convolution layers.
+ generator_nonlinear_activation (str):
+ Activation function module name.
+ generator_nonlinear_activation_params (Dict[str, Any]):
+ Hyperparameters for activation function.
+ generator_use_weight_norm (bool):
+ Whether to use weight norm. If set to true, it will be applied to all of the conv layers.
+
+ """
+ super().__init__()
+ self.segment_size = segment_size
+ self.upsample_factor = int(np.prod(generator_upsample_scales))
+ self.idim = idim
+ self.odim = odim
+ self.reduction_factor = reduction_factor
+ self.encoder_type = encoder_type
+ self.decoder_type = decoder_type
+ self.stop_gradient_from_pitch_predictor = stop_gradient_from_pitch_predictor
+ self.stop_gradient_from_energy_predictor = stop_gradient_from_energy_predictor
+ self.use_scaled_pos_enc = use_scaled_pos_enc
+ self.use_gst = use_gst
+
+ # use idx 0 as padding idx
+ self.padding_idx = 0
+
+ # get positional encoding layer type
+ transformer_pos_enc_layer_type = "scaled_abs_pos" if self.use_scaled_pos_enc else "abs_pos"
+
+ # check relative positional encoding compatibility
+ if "conformer" in [encoder_type, decoder_type]:
+ if conformer_rel_pos_type == "legacy":
+ if conformer_pos_enc_layer_type == "rel_pos":
+ conformer_pos_enc_layer_type = "legacy_rel_pos"
+ logging.warning(
+ "Fallback to conformer_pos_enc_layer_type = 'legacy_rel_pos' "
+ "due to the compatibility. If you want to use the new one, "
+ "please use conformer_pos_enc_layer_type = 'latest'.")
+ if conformer_self_attn_layer_type == "rel_selfattn":
+ conformer_self_attn_layer_type = "legacy_rel_selfattn"
+ logging.warning(
+ "Fallback to "
+ "conformer_self_attn_layer_type = 'legacy_rel_selfattn' "
+ "due to the compatibility. If you want to use the new one, "
+ "please use conformer_pos_enc_layer_type = 'latest'.")
+ elif conformer_rel_pos_type == "latest":
+ assert conformer_pos_enc_layer_type != "legacy_rel_pos"
+ assert conformer_self_attn_layer_type != "legacy_rel_selfattn"
+ else:
+ raise ValueError(
+ f"Unknown rel_pos_type: {conformer_rel_pos_type}")
+
+ # define encoder
+ encoder_input_layer = nn.Embedding(
+ num_embeddings=idim,
+ embedding_dim=adim,
+ padding_idx=self.padding_idx)
+ if encoder_type == "transformer":
+ self.encoder = TransformerEncoder(
+ idim=idim,
+ attention_dim=adim,
+ attention_heads=aheads,
+ linear_units=eunits,
+ num_blocks=elayers,
+ input_layer=encoder_input_layer,
+ dropout_rate=transformer_enc_dropout_rate,
+ positional_dropout_rate=transformer_enc_positional_dropout_rate,
+ attention_dropout_rate=transformer_enc_attn_dropout_rate,
+ pos_enc_layer_type=transformer_pos_enc_layer_type,
+ normalize_before=encoder_normalize_before,
+ concat_after=encoder_concat_after,
+ positionwise_layer_type=positionwise_layer_type,
+ positionwise_conv_kernel_size=positionwise_conv_kernel_size,
+ activation_type=transformer_activation_type)
+ elif encoder_type == "conformer":
+ self.encoder = ConformerEncoder(
+ idim=idim,
+ attention_dim=adim,
+ attention_heads=aheads,
+ linear_units=eunits,
+ num_blocks=elayers,
+ input_layer=encoder_input_layer,
+ dropout_rate=transformer_enc_dropout_rate,
+ positional_dropout_rate=transformer_enc_positional_dropout_rate,
+ attention_dropout_rate=transformer_enc_attn_dropout_rate,
+ normalize_before=encoder_normalize_before,
+ concat_after=encoder_concat_after,
+ positionwise_layer_type=positionwise_layer_type,
+ positionwise_conv_kernel_size=positionwise_conv_kernel_size,
+ macaron_style=use_macaron_style_in_conformer,
+ pos_enc_layer_type=conformer_pos_enc_layer_type,
+ selfattention_layer_type=conformer_self_attn_layer_type,
+ activation_type=conformer_activation_type,
+ use_cnn_module=use_cnn_in_conformer,
+ cnn_module_kernel=conformer_enc_kernel_size,
+ zero_triu=zero_triu, )
+ else:
+ raise ValueError(f"{encoder_type} is not supported.")
+
+ # define GST
+ if self.use_gst:
+ self.gst = StyleEncoder(
+ idim=odim, # the input is mel-spectrogram
+ gst_tokens=gst_tokens,
+ gst_token_dim=adim,
+ gst_heads=gst_heads,
+ conv_layers=gst_conv_layers,
+ conv_chans_list=gst_conv_chans_list,
+ conv_kernel_size=gst_conv_kernel_size,
+ conv_stride=gst_conv_stride,
+ gru_layers=gst_gru_layers,
+ gru_units=gst_gru_units, )
+
+ # define spk and lang embedding
+ self.spks = None
+ if spks is not None and spks > 1:
+ self.spks = spks
+ self.sid_emb = nn.Embedding(spks, adim)
+ self.langs = None
+ if langs is not None and langs > 1:
+ self.langs = langs
+ self.lid_emb = nn.Embedding(langs, adim)
+
+ # define additional projection for speaker embedding
+ self.spk_embed_dim = None
+ if spk_embed_dim is not None and spk_embed_dim > 0:
+ self.spk_embed_dim = spk_embed_dim
+ self.spk_embed_integration_type = spk_embed_integration_type
+ if self.spk_embed_dim is not None:
+ if self.spk_embed_integration_type == "add":
+ self.projection = nn.Linear(self.spk_embed_dim, adim)
+ else:
+ self.projection = nn.Linear(adim + self.spk_embed_dim, adim)
+
+ # define duration predictor
+ self.duration_predictor = DurationPredictor(
+ idim=adim,
+ n_layers=duration_predictor_layers,
+ n_chans=duration_predictor_chans,
+ kernel_size=duration_predictor_kernel_size,
+ dropout_rate=duration_predictor_dropout_rate, )
+
+ # define pitch predictor
+ self.pitch_predictor = VariancePredictor(
+ idim=adim,
+ n_layers=pitch_predictor_layers,
+ n_chans=pitch_predictor_chans,
+ kernel_size=pitch_predictor_kernel_size,
+ dropout_rate=pitch_predictor_dropout, )
+ # NOTE(kan-bayashi): We use continuous pitch + FastPitch style avg
+ self.pitch_embed = nn.Sequential(
+ nn.Conv1D(
+ in_channels=1,
+ out_channels=adim,
+ kernel_size=pitch_embed_kernel_size,
+ padding=(pitch_embed_kernel_size - 1) // 2, ),
+ nn.Dropout(pitch_embed_dropout), )
+
+ # define energy predictor
+ self.energy_predictor = VariancePredictor(
+ idim=adim,
+ n_layers=energy_predictor_layers,
+ n_chans=energy_predictor_chans,
+ kernel_size=energy_predictor_kernel_size,
+ dropout_rate=energy_predictor_dropout, )
+        # NOTE(kan-bayashi): We use continuous energy + FastPitch style avg
+ self.energy_embed = nn.Sequential(
+ nn.Conv1D(
+ in_channels=1,
+ out_channels=adim,
+ kernel_size=energy_embed_kernel_size,
+ padding=(energy_embed_kernel_size - 1) // 2, ),
+ nn.Dropout(energy_embed_dropout), )
+
+ # define length regulator
+ self.length_regulator = GaussianUpsampling()
+
+ # define decoder
+ # NOTE: we use encoder as decoder
+ # because fastspeech's decoder is the same as encoder
+ if decoder_type == "transformer":
+ self.decoder = TransformerEncoder(
+ idim=0,
+ attention_dim=adim,
+ attention_heads=aheads,
+ linear_units=dunits,
+ num_blocks=dlayers,
+ # in decoder, don't need layer before pos_enc_class (we use embedding here in encoder)
+ input_layer=None,
+ dropout_rate=transformer_dec_dropout_rate,
+ positional_dropout_rate=transformer_dec_positional_dropout_rate,
+ attention_dropout_rate=transformer_dec_attn_dropout_rate,
+ pos_enc_layer_type=transformer_pos_enc_layer_type,
+ normalize_before=decoder_normalize_before,
+ concat_after=decoder_concat_after,
+ positionwise_layer_type=positionwise_layer_type,
+ positionwise_conv_kernel_size=positionwise_conv_kernel_size,
+ activation_type=conformer_activation_type, )
+
+ elif decoder_type == "conformer":
+ self.decoder = ConformerEncoder(
+ idim=0,
+ attention_dim=adim,
+ attention_heads=aheads,
+ linear_units=dunits,
+ num_blocks=dlayers,
+ input_layer=None,
+ dropout_rate=transformer_dec_dropout_rate,
+ positional_dropout_rate=transformer_dec_positional_dropout_rate,
+ attention_dropout_rate=transformer_dec_attn_dropout_rate,
+ normalize_before=decoder_normalize_before,
+ concat_after=decoder_concat_after,
+ positionwise_layer_type=positionwise_layer_type,
+ positionwise_conv_kernel_size=positionwise_conv_kernel_size,
+ macaron_style=use_macaron_style_in_conformer,
+ pos_enc_layer_type=conformer_pos_enc_layer_type,
+ selfattention_layer_type=conformer_self_attn_layer_type,
+ activation_type=conformer_activation_type,
+ use_cnn_module=use_cnn_in_conformer,
+ cnn_module_kernel=conformer_dec_kernel_size, )
+ else:
+ raise ValueError(f"{decoder_type} is not supported.")
+
+ self.generator = HiFiGANGenerator(
+ in_channels=adim,
+ out_channels=generator_out_channels,
+ channels=generator_channels,
+ global_channels=generator_global_channels,
+ kernel_size=generator_kernel_size,
+ upsample_scales=generator_upsample_scales,
+ upsample_kernel_sizes=generator_upsample_kernel_sizes,
+ resblock_kernel_sizes=generator_resblock_kernel_sizes,
+ resblock_dilations=generator_resblock_dilations,
+ use_additional_convs=generator_use_additional_convs,
+ bias=generator_bias,
+ nonlinear_activation=generator_nonlinear_activation,
+ nonlinear_activation_params=generator_nonlinear_activation_params,
+ use_weight_norm=generator_use_weight_norm, )
+
+ self.alignment_module = AlignmentModule(adim, odim)
+
+ # initialize parameters
+ self._reset_parameters(
+ init_type=init_type,
+ init_enc_alpha=init_enc_alpha,
+ init_dec_alpha=init_dec_alpha, )
+
+ def forward(
+ self,
+ text: paddle.Tensor,
+ text_lengths: paddle.Tensor,
+ feats: paddle.Tensor,
+ feats_lengths: paddle.Tensor,
+ durations: paddle.Tensor,
+ durations_lengths: paddle.Tensor,
+ pitch: paddle.Tensor,
+ energy: paddle.Tensor,
+ sids: Optional[paddle.Tensor]=None,
+ spembs: Optional[paddle.Tensor]=None,
+ lids: Optional[paddle.Tensor]=None,
+ use_alignment_module: bool=False,
+ ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor,
+ paddle.Tensor, paddle.Tensor,
+ Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor,
+ paddle.Tensor, paddle.Tensor, ], ]:
+ """Calculate forward propagation.
+ Args:
+ text (Tensor):
+ Text index tensor (B, T_text).
+ text_lengths (Tensor):
+ Text length tensor (B,).
+ feats (Tensor):
+ Feature tensor (B, aux_channels, T_feats).
+ feats_lengths (Tensor):
+ Feature length tensor (B,).
+ pitch (Tensor):
+ Batch of padded token-averaged pitch (B, T_text, 1).
+ energy (Tensor):
+ Batch of padded token-averaged energy (B, T_text, 1).
+ sids (Optional[Tensor]):
+ Speaker index tensor (B,) or (B, 1).
+ spembs (Optional[Tensor]):
+ Speaker embedding tensor (B, spk_embed_dim).
+ lids (Optional[Tensor]):
+ Language index tensor (B,) or (B, 1).
+ use_alignment_module (bool):
+ Whether to use alignment module.
+
+ Returns:
+ Tensor:
+ Waveform tensor (B, 1, segment_size * upsample_factor).
+ Tensor:
+ binarization loss ()
+ Tensor:
+ log probability attention matrix (B,T_feats,T_text)
+ Tensor:
+ Segments start index tensor (B,).
+ Tensor:
+ predicted duration (B,T_text)
+ Tensor:
+ ground-truth duration obtained from an alignment module (B,T_text)
+ Tensor:
+ predicted pitch (B,T_text,1)
+ Tensor:
+ ground-truth averaged pitch (B,T_text,1)
+ Tensor:
+ predicted energy (B,T_text,1)
+ Tensor:
+ ground-truth averaged energy (B,T_text,1)
+ """
+ if use_alignment_module:
+ text = text[:, :text_lengths.max()] # for data-parallel
+ feats = feats[:, :feats_lengths.max()] # for data-parallel
+ pitch = pitch[:, :durations_lengths.max()] # for data-parallel
+ energy = energy[:, :durations_lengths.max()] # for data-parallel
+ else:
+ text = text[:, :text_lengths.max()] # for data-parallel
+ feats = feats[:, :feats_lengths.max()] # for data-parallel
+ pitch = pitch[:, :feats_lengths.max()] # for data-parallel
+ energy = energy[:, :feats_lengths.max()] # for data-parallel
+
+ # forward encoder
+ x_masks = self._source_mask(text_lengths)
+ hs, _ = self.encoder(text, x_masks) # (B, T_text, adim)
+
+ # integrate with GST
+ if self.use_gst:
+            style_embs = self.gst(feats)
+ hs = hs + style_embs.unsqueeze(1)
+
+ # integrate with SID and LID embeddings
+ if self.spks is not None:
+ sid_embs = self.sid_emb(sids.reshape([-1]))
+ hs = hs + sid_embs.unsqueeze(1)
+ if self.langs is not None:
+ lid_embs = self.lid_emb(lids.reshape([-1]))
+ hs = hs + lid_embs.unsqueeze(1)
+
+ # integrate speaker embedding
+ if self.spk_embed_dim is not None:
+ hs = self._integrate_with_spk_embed(hs, spembs)
+
+ # forward alignment module and obtain duration, averaged pitch, energy
+ h_masks = make_pad_mask(text_lengths)
+ if use_alignment_module:
+            log_p_attn, attn = self.alignment_module(hs, feats, h_masks)
+ ds, bin_loss = viterbi_decode(log_p_attn, text_lengths,
+ feats_lengths)
+ ps = average_by_duration(ds,
+ pitch.squeeze(-1), text_lengths,
+ feats_lengths).unsqueeze(-1)
+ es = average_by_duration(ds,
+ energy.squeeze(-1), text_lengths,
+ feats_lengths).unsqueeze(-1)
+ else:
+ ds = durations
+ ps = pitch
+ es = energy
+ log_p_attn = attn = bin_loss = None
+
+ # forward duration predictor and variance predictors
+ if self.stop_gradient_from_pitch_predictor:
+ p_outs = self.pitch_predictor(hs.detach(), h_masks.unsqueeze(-1))
+ else:
+ p_outs = self.pitch_predictor(hs, h_masks.unsqueeze(-1))
+ if self.stop_gradient_from_energy_predictor:
+ e_outs = self.energy_predictor(hs.detach(), h_masks.unsqueeze(-1))
+ else:
+ e_outs = self.energy_predictor(hs, h_masks.unsqueeze(-1))
+
+ d_outs = self.duration_predictor(hs, h_masks)
+
+ # use groundtruth in training
+ p_embs = self.pitch_embed(ps.transpose([0, 2, 1])).transpose([0, 2, 1])
+ e_embs = self.energy_embed(es.transpose([0, 2, 1])).transpose([0, 2, 1])
+ hs = hs + e_embs + p_embs
+
+ # upsampling
+ h_masks = make_non_pad_mask(feats_lengths)
+ # d_masks = make_non_pad_mask(text_lengths).to(ds.device)
+ d_masks = make_non_pad_mask(text_lengths)
+ hs = self.length_regulator(hs, ds, h_masks,
+ d_masks) # (B, T_feats, adim)
+
+ # forward decoder
+ h_masks = self._source_mask(feats_lengths)
+ zs, _ = self.decoder(hs, h_masks) # (B, T_feats, adim)
+
+ # get random segments
+ z_segments, z_start_idxs = get_random_segments(
+ zs.transpose([0, 2, 1]),
+ feats_lengths,
+ self.segment_size, )
+ # forward generator
+ wav = self.generator(z_segments)
+ if use_alignment_module:
+ return wav, bin_loss, log_p_attn, z_start_idxs, d_outs, ds, p_outs, ps, e_outs, es
+ else:
+ return wav, None, None, z_start_idxs, d_outs, ds, p_outs, ps, e_outs, es
+
+ def inference(
+ self,
+ text: paddle.Tensor,
+ text_lengths: paddle.Tensor,
+ feats: Optional[paddle.Tensor]=None,
+ feats_lengths: Optional[paddle.Tensor]=None,
+ pitch: Optional[paddle.Tensor]=None,
+ energy: Optional[paddle.Tensor]=None,
+ sids: Optional[paddle.Tensor]=None,
+ spembs: Optional[paddle.Tensor]=None,
+ lids: Optional[paddle.Tensor]=None,
+ use_alignment_module: bool=False,
+ ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+ """Run inference.
+
+ Args:
+ text (Tensor): Input text index tensor (B, T_text,).
+ text_lengths (Tensor): Text length tensor (B,).
+ feats (Tensor): Feature tensor (B, T_feats, aux_channels).
+ feats_lengths (Tensor): Feature length tensor (B,).
+ pitch (Tensor): Pitch tensor (B, T_feats, 1)
+ energy (Tensor): Energy tensor (B, T_feats, 1)
+ sids (Optional[Tensor]): Speaker index tensor (B,) or (B, 1).
+ spembs (Optional[Tensor]): Speaker embedding tensor (B, spk_embed_dim).
+ lids (Optional[Tensor]): Language index tensor (B,) or (B, 1).
+ use_alignment_module (bool): Whether to use alignment module.
+
+ Returns:
+ Tensor: Generated waveform tensor (B, T_wav).
+ Tensor: Duration tensor (B, T_text).
+
+ """
+ # forward encoder
+ x_masks = self._source_mask(text_lengths)
+ hs, _ = self.encoder(text, x_masks) # (B, T_text, adim)
+
+ # integrate with GST
+ if self.use_gst:
+            style_embs = self.gst(feats)
+ hs = hs + style_embs.unsqueeze(1)
+
+ # integrate with SID and LID embeddings
+ if self.spks is not None:
+            sid_embs = self.sid_emb(sids.reshape([-1]))
+ hs = hs + sid_embs.unsqueeze(1)
+ if self.langs is not None:
+            lid_embs = self.lid_emb(lids.reshape([-1]))
+ hs = hs + lid_embs.unsqueeze(1)
+
+ # integrate speaker embedding
+ if self.spk_embed_dim is not None:
+ hs = self._integrate_with_spk_embed(hs, spembs)
+
+ h_masks = make_pad_mask(text_lengths)
+ if use_alignment_module:
+ # forward alignment module and obtain duration, averaged pitch, energy
+ log_p_attn, attn = self.alignment_module(hs, feats, h_masks)
+ d_outs, _ = viterbi_decode(log_p_attn, text_lengths, feats_lengths)
+ p_outs = average_by_duration(d_outs,
+ pitch.squeeze(-1), text_lengths,
+ feats_lengths).unsqueeze(-1)
+ e_outs = average_by_duration(d_outs,
+ energy.squeeze(-1), text_lengths,
+ feats_lengths).unsqueeze(-1)
+ else:
+ # forward duration predictor and variance predictors
+ p_outs = self.pitch_predictor(hs, h_masks.unsqueeze(-1))
+ e_outs = self.energy_predictor(hs, h_masks.unsqueeze(-1))
+ d_outs = self.duration_predictor.inference(hs, h_masks)
+
+ p_embs = self.pitch_embed(p_outs.transpose([0, 2, 1])).transpose(
+ [0, 2, 1])
+ e_embs = self.energy_embed(e_outs.transpose([0, 2, 1])).transpose(
+ [0, 2, 1])
+ hs = hs + e_embs + p_embs
+
+ # upsampling
+ if feats_lengths is not None:
+ h_masks = make_non_pad_mask(feats_lengths)
+ else:
+ h_masks = None
+ d_masks = make_non_pad_mask(text_lengths)
+ hs = self.length_regulator(hs, d_outs, h_masks,
+ d_masks) # (B, T_feats, adim)
+
+ # forward decoder
+ if feats_lengths is not None:
+ h_masks = self._source_mask(feats_lengths)
+ else:
+ h_masks = None
+ zs, _ = self.decoder(hs, h_masks) # (B, T_feats, adim)
+
+ # forward generator
+ wav = self.generator(zs.transpose([0, 2, 1]))
+
+ return wav.squeeze(1), d_outs
+
+ def _integrate_with_spk_embed(self,
+ hs: paddle.Tensor,
+ spembs: paddle.Tensor) -> paddle.Tensor:
+ """Integrate speaker embedding with hidden states.
+
+ Args:
+ hs (Tensor): Batch of hidden state sequences (B, T_text, adim).
+ spembs (Tensor): Batch of speaker embeddings (B, spk_embed_dim).
+
+ Returns:
+ Tensor: Batch of integrated hidden state sequences (B, T_text, adim).
+
+ """
+ if self.spk_embed_integration_type == "add":
+ # apply projection and then add to hidden states
+ spembs = self.projection(F.normalize(spembs))
+ hs = hs + spembs.unsqueeze(1)
+ elif self.spk_embed_integration_type == "concat":
+ # concat hidden states with spk embeds and then apply projection
+            spembs = F.normalize(spembs).unsqueeze(1).expand(
+                [-1, hs.shape[1], -1])
+ hs = self.projection(paddle.concat([hs, spembs], axis=-1))
+ else:
+ raise NotImplementedError("support only add or concat.")
+
+ return hs
+
+ def _generate_path(self, dur: paddle.Tensor,
+ mask: paddle.Tensor) -> paddle.Tensor:
+ """Generate path a.k.a. monotonic attention.
+ Args:
+ dur (Tensor):
+ Duration tensor (B, 1, T_text).
+ mask (Tensor):
+ Attention mask tensor (B, 1, T_feats, T_text).
+ Returns:
+ Tensor:
+ Path tensor (B, 1, T_feats, T_text).
+ """
+ b, _, t_y, t_x = paddle.shape(mask)
+ cum_dur = paddle.cumsum(dur, -1)
+ cum_dur_flat = paddle.reshape(cum_dur, [b * t_x])
+
+ path = paddle.arange(t_y, dtype=dur.dtype)
+ path = path.unsqueeze(0) < cum_dur_flat.unsqueeze(1)
+ path = paddle.reshape(path, [b, t_x, t_y])
+ '''
+ path will be like (t_x = 3, t_y = 5):
+ [[[1., 1., 0., 0., 0.], [[[1., 1., 0., 0., 0.],
+ [1., 1., 1., 1., 0.], --> [0., 0., 1., 1., 0.],
+ [1., 1., 1., 1., 1.]]] [0., 0., 0., 0., 1.]]]
+ '''
+
+ path = paddle.cast(path, dtype='float32')
+ pad_tmp = self.pad1d(path)[:, :-1]
+ path = path - pad_tmp
+ return path.unsqueeze(1).transpose([0, 1, 3, 2]) * mask
+
+ def _source_mask(self, ilens: paddle.Tensor) -> paddle.Tensor:
+ """Make masks for self-attention.
+
+ Args:
+ ilens (LongTensor): Batch of lengths (B,).
+
+ Returns:
+ Tensor: Mask tensor for self-attention.
+ dtype=paddle.uint8
+
+ Examples:
+ >>> ilens = [5, 3]
+ >>> self._source_mask(ilens)
+ tensor([[[1, 1, 1, 1, 1],
+                             [1, 1, 1, 0, 0]]], dtype=paddle.uint8)
+
+ """
+ x_masks = paddle.to_tensor(make_non_pad_mask(ilens))
+ return x_masks.unsqueeze(-2)
+
+ def _reset_parameters(self,
+ init_type: str,
+ init_enc_alpha: float,
+ init_dec_alpha: float):
+ # initialize parameters
+ initialize(self, init_type)
+
+ # initialize alpha in scaled positional encoding
+ if self.encoder_type == "transformer" and self.use_scaled_pos_enc:
+ self.encoder.embed[-1].alpha.data = paddle.to_tensor(init_enc_alpha)
+ if self.decoder_type == "transformer" and self.use_scaled_pos_enc:
+ self.decoder.embed[-1].alpha.data = paddle.to_tensor(init_dec_alpha)
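
As a side note, the commented example inside `_generate_path` above can be reproduced with a few NumPy lines; this is only an illustrative sketch of the duration-to-monotonic-path construction and does not reuse the layer's own padding/mask plumbing.

```python
import numpy as np

def duration_to_path(dur):
    """dur: (T_text,) integer durations -> hard monotonic attention path (T_text, T_y)."""
    t_y = int(dur.sum())                      # total number of output frames
    cum = np.cumsum(dur)                      # cumulative end frame of each token
    frame = np.arange(t_y)
    path = (frame[None, :] < cum[:, None]).astype(np.float32)
    path[1:] -= path[:-1].copy()              # subtract the previous row -> disjoint blocks
    return path

print(duration_to_path(np.array([2, 2, 1])))
# [[1. 1. 0. 0. 0.]
#  [0. 0. 1. 1. 0.]
#  [0. 0. 0. 0. 1.]]
```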
diff --git a/paddlespeech/t2s/models/jets/jets.py b/paddlespeech/t2s/models/jets/jets.py
new file mode 100644
index 00000000..4346c65b
--- /dev/null
+++ b/paddlespeech/t2s/models/jets/jets.py
@@ -0,0 +1,582 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Generator module in JETS.
+
+This code is based on https://github.com/imdanboy/jets.
+
+"""
+"""JETS module"""
+import math
+from typing import Any
+from typing import Dict
+from typing import Optional
+
+import paddle
+from paddle import nn
+from typeguard import check_argument_types
+
+from paddlespeech.t2s.models.hifigan import HiFiGANMultiPeriodDiscriminator
+from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleDiscriminator
+from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleMultiPeriodDiscriminator
+from paddlespeech.t2s.models.hifigan import HiFiGANPeriodDiscriminator
+from paddlespeech.t2s.models.hifigan import HiFiGANScaleDiscriminator
+from paddlespeech.t2s.models.jets.generator import JETSGenerator
+from paddlespeech.utils.initialize import _calculate_fan_in_and_fan_out
+from paddlespeech.utils.initialize import kaiming_uniform_
+from paddlespeech.utils.initialize import normal_
+from paddlespeech.utils.initialize import ones_
+from paddlespeech.utils.initialize import uniform_
+from paddlespeech.utils.initialize import zeros_
+
+AVAILABLE_GENERATERS = {
+ "jets_generator": JETSGenerator,
+}
+AVAILABLE_DISCRIMINATORS = {
+ "hifigan_period_discriminator":
+ HiFiGANPeriodDiscriminator,
+ "hifigan_scale_discriminator":
+ HiFiGANScaleDiscriminator,
+ "hifigan_multi_period_discriminator":
+ HiFiGANMultiPeriodDiscriminator,
+ "hifigan_multi_scale_discriminator":
+ HiFiGANMultiScaleDiscriminator,
+ "hifigan_multi_scale_multi_period_discriminator":
+ HiFiGANMultiScaleMultiPeriodDiscriminator,
+}
+
+
+class JETS(nn.Layer):
+ """JETS module (generator + discriminator).
+ This is a module of JETS described in `JETS: Jointly Training FastSpeech2
+ and HiFi-GAN for End to End Text to Speech`_.
+    .. _`JETS: Jointly Training FastSpeech2 and HiFi-GAN for End to End Text to Speech`:
+        https://arxiv.org/abs/2203.16852v1
+ """
+
+ def __init__(
+ self,
+ # generator related
+ idim: int,
+ odim: int,
+ sampling_rate: int=22050,
+ generator_type: str="jets_generator",
+ generator_params: Dict[str, Any]={
+ "adim": 256,
+ "aheads": 2,
+ "elayers": 4,
+ "eunits": 1024,
+ "dlayers": 4,
+ "dunits": 1024,
+ "positionwise_layer_type": "conv1d",
+ "positionwise_conv_kernel_size": 1,
+ "use_scaled_pos_enc": True,
+ "use_batch_norm": True,
+ "encoder_normalize_before": True,
+ "decoder_normalize_before": True,
+ "encoder_concat_after": False,
+ "decoder_concat_after": False,
+ "reduction_factor": 1,
+ "encoder_type": "transformer",
+ "decoder_type": "transformer",
+ "transformer_enc_dropout_rate": 0.1,
+ "transformer_enc_positional_dropout_rate": 0.1,
+ "transformer_enc_attn_dropout_rate": 0.1,
+ "transformer_dec_dropout_rate": 0.1,
+ "transformer_dec_positional_dropout_rate": 0.1,
+ "transformer_dec_attn_dropout_rate": 0.1,
+ "conformer_rel_pos_type": "latest",
+ "conformer_pos_enc_layer_type": "rel_pos",
+ "conformer_self_attn_layer_type": "rel_selfattn",
+ "conformer_activation_type": "swish",
+ "use_macaron_style_in_conformer": True,
+ "use_cnn_in_conformer": True,
+ "zero_triu": False,
+ "conformer_enc_kernel_size": 7,
+ "conformer_dec_kernel_size": 31,
+ "duration_predictor_layers": 2,
+ "duration_predictor_chans": 384,
+ "duration_predictor_kernel_size": 3,
+ "duration_predictor_dropout_rate": 0.1,
+ "energy_predictor_layers": 2,
+ "energy_predictor_chans": 384,
+ "energy_predictor_kernel_size": 3,
+ "energy_predictor_dropout": 0.5,
+ "energy_embed_kernel_size": 1,
+ "energy_embed_dropout": 0.5,
+ "stop_gradient_from_energy_predictor": False,
+ "pitch_predictor_layers": 5,
+ "pitch_predictor_chans": 384,
+ "pitch_predictor_kernel_size": 5,
+ "pitch_predictor_dropout": 0.5,
+ "pitch_embed_kernel_size": 1,
+ "pitch_embed_dropout": 0.5,
+ "stop_gradient_from_pitch_predictor": True,
+ "generator_out_channels": 1,
+ "generator_channels": 512,
+ "generator_global_channels": -1,
+ "generator_kernel_size": 7,
+ "generator_upsample_scales": [8, 8, 2, 2],
+ "generator_upsample_kernel_sizes": [16, 16, 4, 4],
+ "generator_resblock_kernel_sizes": [3, 7, 11],
+ "generator_resblock_dilations":
+ [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+ "generator_use_additional_convs": True,
+ "generator_bias": True,
+ "generator_nonlinear_activation": "LeakyReLU",
+ "generator_nonlinear_activation_params": {
+ "negative_slope": 0.1
+ },
+ "generator_use_weight_norm": True,
+ "segment_size": 64,
+ "spks": -1,
+ "langs": -1,
+ "spk_embed_dim": None,
+ "spk_embed_integration_type": "add",
+ "use_gst": False,
+ "gst_tokens": 10,
+ "gst_heads": 4,
+ "gst_conv_layers": 6,
+ "gst_conv_chans_list": [32, 32, 64, 64, 128, 128],
+ "gst_conv_kernel_size": 3,
+ "gst_conv_stride": 2,
+ "gst_gru_layers": 1,
+ "gst_gru_units": 128,
+ "init_type": "xavier_uniform",
+ "init_enc_alpha": 1.0,
+ "init_dec_alpha": 1.0,
+ "use_masking": False,
+ "use_weighted_masking": False,
+ },
+ # discriminator related
+ discriminator_type: str="hifigan_multi_scale_multi_period_discriminator",
+ discriminator_params: Dict[str, Any]={
+ "scales": 1,
+ "scale_downsample_pooling": "AvgPool1D",
+ "scale_downsample_pooling_params": {
+ "kernel_size": 4,
+ "stride": 2,
+ "padding": 2,
+ },
+ "scale_discriminator_params": {
+ "in_channels": 1,
+ "out_channels": 1,
+ "kernel_sizes": [15, 41, 5, 3],
+ "channels": 128,
+ "max_downsample_channels": 1024,
+ "max_groups": 16,
+ "bias": True,
+ "downsample_scales": [2, 2, 4, 4, 1],
+ "nonlinear_activation": "leakyrelu",
+ "nonlinear_activation_params": {
+ "negative_slope": 0.1
+ },
+ "use_weight_norm": True,
+ "use_spectral_norm": False,
+ },
+ "follow_official_norm": False,
+ "periods": [2, 3, 5, 7, 11],
+ "period_discriminator_params": {
+ "in_channels": 1,
+ "out_channels": 1,
+ "kernel_sizes": [5, 3],
+ "channels": 32,
+ "downsample_scales": [3, 3, 3, 3, 1],
+ "max_downsample_channels": 1024,
+ "bias": True,
+ "nonlinear_activation": "leakyrelu",
+ "nonlinear_activation_params": {
+ "negative_slope": 0.1
+ },
+ "use_weight_norm": True,
+ "use_spectral_norm": False,
+ },
+ },
+ cache_generator_outputs: bool=True, ):
+ """Initialize JETS module.
+ Args:
+ idim (int):
+                Input vocabulary size.
+ odim (int):
+                Acoustic feature dimension. The actual output channels will be 1,
+                since JETS is an end-to-end text-to-waveform model, but odim is
+                kept for compatibility to indicate the acoustic feature dimension.
+ sampling_rate (int):
+                Sampling rate; not used for training, but referred to when
+                saving waveforms during inference.
+ generator_type (str):
+ Generator type.
+ generator_params (Dict[str, Any]):
+ Parameter dict for generator.
+ discriminator_type (str):
+ Discriminator type.
+ discriminator_params (Dict[str, Any]):
+ Parameter dict for discriminator.
+ cache_generator_outputs (bool):
+ Whether to cache generator outputs.
+ """
+ assert check_argument_types()
+ super().__init__()
+
+ # define modules
+ generator_class = AVAILABLE_GENERATERS[generator_type]
+ if generator_type == "jets_generator":
+ # NOTE: Update parameters for the compatibility.
+ # The idim and odim is automatically decided from input data,
+ # where idim represents #vocabularies and odim represents
+ # the input acoustic feature dimension.
+ generator_params.update(idim=idim, odim=odim)
+ self.generator = generator_class(
+ **generator_params, )
+ discriminator_class = AVAILABLE_DISCRIMINATORS[discriminator_type]
+ self.discriminator = discriminator_class(
+ **discriminator_params, )
+
+ # cache
+ self.cache_generator_outputs = cache_generator_outputs
+ self._cache = None
+
+ # store sampling rate for saving wav file
+ # (not used for the training)
+ self.fs = sampling_rate
+
+ # store parameters for test compatibility
+ self.spks = self.generator.spks
+ self.langs = self.generator.langs
+ self.spk_embed_dim = self.generator.spk_embed_dim
+
+ self.reuse_cache_gen = True
+ self.reuse_cache_dis = True
+
+ self.reset_parameters()
+ self.generator._reset_parameters(
+ init_type=generator_params["init_type"],
+ init_enc_alpha=generator_params["init_enc_alpha"],
+ init_dec_alpha=generator_params["init_dec_alpha"], )
+
+ def forward(
+ self,
+ text: paddle.Tensor,
+ text_lengths: paddle.Tensor,
+ feats: paddle.Tensor,
+ feats_lengths: paddle.Tensor,
+ durations: paddle.Tensor,
+ durations_lengths: paddle.Tensor,
+ pitch: paddle.Tensor,
+ energy: paddle.Tensor,
+ sids: Optional[paddle.Tensor]=None,
+ spembs: Optional[paddle.Tensor]=None,
+ lids: Optional[paddle.Tensor]=None,
+ forward_generator: bool=True,
+ use_alignment_module: bool=False,
+ **kwargs,
+ ) -> Dict[str, Any]:
+ """Perform generator forward.
+ Args:
+ text (Tensor):
+ Text index tensor (B, T_text).
+ text_lengths (Tensor):
+ Text length tensor (B,).
+ feats (Tensor):
+ Feature tensor (B, T_feats, aux_channels).
+ feats_lengths (Tensor):
+ Feature length tensor (B,).
+ durations(Tensor(int64)):
+ Batch of padded durations (B, Tmax).
+ durations_lengths (Tensor):
+ durations length tensor (B,).
+ pitch(Tensor):
+ Batch of padded token-averaged pitch (B, Tmax, 1).
+ energy(Tensor):
+ Batch of padded token-averaged energy (B, Tmax, 1).
+ sids (Optional[Tensor]):
+ Speaker index tensor (B,) or (B, 1).
+ spembs (Optional[Tensor]):
+ Speaker embedding tensor (B, spk_embed_dim).
+ lids (Optional[Tensor]):
+ Language index tensor (B,) or (B, 1).
+ forward_generator (bool):
+ Whether to forward generator.
+ use_alignment_module (bool):
+ Whether to use alignment module.
+ Returns:
+
+ """
+ if forward_generator:
+ return self._forward_generator(
+ text=text,
+ text_lengths=text_lengths,
+ feats=feats,
+ feats_lengths=feats_lengths,
+ durations=durations,
+ durations_lengths=durations_lengths,
+ pitch=pitch,
+ energy=energy,
+ sids=sids,
+ spembs=spembs,
+ lids=lids,
+ use_alignment_module=use_alignment_module, )
+ else:
+            return self._forward_discriminator(
+ text=text,
+ text_lengths=text_lengths,
+ feats=feats,
+ feats_lengths=feats_lengths,
+ durations=durations,
+ durations_lengths=durations_lengths,
+ pitch=pitch,
+ energy=energy,
+ sids=sids,
+ spembs=spembs,
+ lids=lids,
+ use_alignment_module=use_alignment_module, )
+
+ def _forward_generator(
+ self,
+ text: paddle.Tensor,
+ text_lengths: paddle.Tensor,
+ feats: paddle.Tensor,
+ feats_lengths: paddle.Tensor,
+ durations: paddle.Tensor,
+ durations_lengths: paddle.Tensor,
+ pitch: paddle.Tensor,
+ energy: paddle.Tensor,
+ sids: Optional[paddle.Tensor]=None,
+ spembs: Optional[paddle.Tensor]=None,
+ lids: Optional[paddle.Tensor]=None,
+ use_alignment_module: bool=False,
+ **kwargs, ) -> Dict[str, Any]:
+ """Perform generator forward.
+ Args:
+ text (Tensor):
+ Text index tensor (B, T_text).
+ text_lengths (Tensor):
+ Text length tensor (B,).
+ feats (Tensor):
+ Feature tensor (B, T_feats, aux_channels).
+ feats_lengths (Tensor):
+ Feature length tensor (B,).
+ durations(Tensor(int64)):
+ Batch of padded durations (B, Tmax).
+ durations_lengths (Tensor):
+ durations length tensor (B,).
+ pitch(Tensor):
+ Batch of padded token-averaged pitch (B, Tmax, 1).
+ energy(Tensor):
+ Batch of padded token-averaged energy (B, Tmax, 1).
+ sids (Optional[Tensor]):
+ Speaker index tensor (B,) or (B, 1).
+ spembs (Optional[Tensor]):
+ Speaker embedding tensor (B, spk_embed_dim).
+ lids (Optional[Tensor]):
+ Language index tensor (B,) or (B, 1).
+ use_alignment_module (bool):
+ Whether to use alignment module.
+ Returns:
+
+ """
+ # setup
+ # calculate generator outputs
+ self.reuse_cache_gen = True
+ if not self.cache_generator_outputs or self._cache is None:
+ self.reuse_cache_gen = False
+ outs = self.generator(
+ text=text,
+ text_lengths=text_lengths,
+ feats=feats,
+ feats_lengths=feats_lengths,
+ durations=durations,
+ durations_lengths=durations_lengths,
+ pitch=pitch,
+ energy=energy,
+ sids=sids,
+ spembs=spembs,
+ lids=lids,
+ use_alignment_module=use_alignment_module, )
+ else:
+ outs = self._cache
+
+ # store cache
+ if self.training and self.cache_generator_outputs and not self.reuse_cache_gen:
+ self._cache = outs
+
+ return outs
+
+    def _forward_discriminator(
+ self,
+ text: paddle.Tensor,
+ text_lengths: paddle.Tensor,
+ feats: paddle.Tensor,
+ feats_lengths: paddle.Tensor,
+ durations: paddle.Tensor,
+ durations_lengths: paddle.Tensor,
+ pitch: paddle.Tensor,
+ energy: paddle.Tensor,
+ sids: Optional[paddle.Tensor]=None,
+ spembs: Optional[paddle.Tensor]=None,
+ lids: Optional[paddle.Tensor]=None,
+ use_alignment_module: bool=False,
+ **kwargs, ) -> Dict[str, Any]:
+ """Perform discriminator forward.
+ Args:
+ text (Tensor):
+ Text index tensor (B, T_text).
+ text_lengths (Tensor):
+ Text length tensor (B,).
+ feats (Tensor):
+ Feature tensor (B, T_feats, aux_channels).
+ feats_lengths (Tensor):
+ Feature length tensor (B,).
+ durations(Tensor(int64)):
+ Batch of padded durations (B, Tmax).
+ durations_lengths (Tensor):
+ durations length tensor (B,).
+ pitch(Tensor):
+ Batch of padded token-averaged pitch (B, Tmax, 1).
+ energy(Tensor):
+ Batch of padded token-averaged energy (B, Tmax, 1).
+ sids (Optional[Tensor]):
+ Speaker index tensor (B,) or (B, 1).
+ spembs (Optional[Tensor]):
+ Speaker embedding tensor (B, spk_embed_dim).
+ lids (Optional[Tensor]):
+ Language index tensor (B,) or (B, 1).
+ use_alignment_module (bool):
+ Whether to use alignment module.
+ Returns:
+
+ """
+ # setup
+ # calculate generator outputs
+ self.reuse_cache_dis = True
+ if not self.cache_generator_outputs or self._cache is None:
+ self.reuse_cache_dis = False
+ outs = self.generator(
+ text=text,
+ text_lengths=text_lengths,
+ feats=feats,
+ feats_lengths=feats_lengths,
+ durations=durations,
+ durations_lengths=durations_lengths,
+ pitch=pitch,
+ energy=energy,
+ sids=sids,
+ spembs=spembs,
+ lids=lids,
+ use_alignment_module=use_alignment_module,
+ **kwargs, )
+ else:
+ outs = self._cache
+
+ # store cache
+ if self.cache_generator_outputs and not self.reuse_cache_dis:
+ self._cache = outs
+
+ return outs
+
+ def inference(self,
+ text: paddle.Tensor,
+ feats: Optional[paddle.Tensor]=None,
+ pitch: Optional[paddle.Tensor]=None,
+ energy: Optional[paddle.Tensor]=None,
+ use_alignment_module: bool=False,
+ **kwargs) -> Dict[str, paddle.Tensor]:
+ """Run inference.
+ Args:
+ text (Tensor):
+ Input text index tensor (T_text,).
+ feats (Tensor):
+ Feature tensor (T_feats, aux_channels).
+ pitch (Tensor):
+ Pitch tensor (T_feats, 1).
+ energy (Tensor):
+ Energy tensor (T_feats, 1).
+ use_alignment_module (bool):
+ Whether to use alignment module.
+ Returns:
+ Dict[str, Tensor]:
+ * wav (Tensor):
+ Generated waveform tensor (T_wav,).
+ * duration (Tensor):
+ Predicted duration tensor (T_text,).
+ """
+ # setup
+ text = text[None]
+ text_lengths = paddle.to_tensor(paddle.shape(text)[1])
+
+ # inference
+ if use_alignment_module:
+ assert feats is not None
+ feats = feats[None]
+ feats_lengths = paddle.to_tensor(paddle.shape(feats)[1])
+ pitch = pitch[None]
+ energy = energy[None]
+ wav, dur = self.generator.inference(
+ text=text,
+ text_lengths=text_lengths,
+ feats=feats,
+ feats_lengths=feats_lengths,
+ pitch=pitch,
+ energy=energy,
+ use_alignment_module=use_alignment_module,
+ **kwargs)
+ else:
+ wav, dur = self.generator.inference(
+ text=text,
+ text_lengths=text_lengths,
+ **kwargs, )
+ return dict(wav=paddle.reshape(wav, [-1]), duration=dur[0])
+
+ def reset_parameters(self):
+ def _reset_parameters(module):
+ if isinstance(
+ module,
+ (nn.Conv1D, nn.Conv1DTranspose, nn.Conv2D, nn.Conv2DTranspose)):
+ kaiming_uniform_(module.weight, a=math.sqrt(5))
+ if module.bias is not None:
+ fan_in, _ = _calculate_fan_in_and_fan_out(module.weight)
+ if fan_in != 0:
+ bound = 1 / math.sqrt(fan_in)
+ uniform_(module.bias, -bound, bound)
+
+ if isinstance(
+ module,
+ (nn.BatchNorm1D, nn.BatchNorm2D, nn.GroupNorm, nn.LayerNorm)):
+ ones_(module.weight)
+ zeros_(module.bias)
+
+ if isinstance(module, nn.Linear):
+ kaiming_uniform_(module.weight, a=math.sqrt(5))
+ if module.bias is not None:
+ fan_in, _ = _calculate_fan_in_and_fan_out(module.weight)
+ bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+ uniform_(module.bias, -bound, bound)
+
+ if isinstance(module, nn.Embedding):
+ normal_(module.weight)
+ if module._padding_idx is not None:
+ with paddle.no_grad():
+ module.weight[module._padding_idx] = 0
+
+ self.apply(_reset_parameters)
+
+
+class JETSInference(nn.Layer):
+ def __init__(self, model):
+ super().__init__()
+ self.acoustic_model = model
+
+ def forward(self, text, sids=None):
+ out = self.acoustic_model.inference(text)
+ wav = out['wav']
+ return wav
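
A hedged usage sketch of the inference wrapper defined above; the vocabulary size, mel dimension, phone ids and the (omitted) checkpoint loading are assumptions for illustration, not values from this patch.

```python
import paddle
from paddlespeech.t2s.models.jets import JETS, JETSInference

model = JETS(idim=80, odim=80)  # idim: phone vocabulary size, odim: mel bins (assumed values)
# model.set_state_dict(paddle.load("jets.pdparams"))  # restore trained weights here (hypothetical path)
model.eval()

jets_inference = JETSInference(model)
phone_ids = paddle.to_tensor([12, 5, 33, 7, 21], dtype="int64")
with paddle.no_grad():
    wav = jets_inference(phone_ids)  # calls model.inference(text)["wav"] internally
print(wav.shape)  # 1-D waveform tensor (T_wav,)
```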
diff --git a/paddlespeech/t2s/models/jets/jets_updater.py b/paddlespeech/t2s/models/jets/jets_updater.py
new file mode 100644
index 00000000..a82ac85c
--- /dev/null
+++ b/paddlespeech/t2s/models/jets/jets_updater.py
@@ -0,0 +1,437 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Generator module in JETS.
+
+This code is based on https://github.com/imdanboy/jets.
+
+"""
+import logging
+from typing import Dict
+
+import paddle
+from paddle import distributed as dist
+from paddle.io import DataLoader
+from paddle.nn import Layer
+from paddle.optimizer import Optimizer
+from paddle.optimizer.lr import LRScheduler
+
+from paddlespeech.t2s.modules.nets_utils import get_segments
+from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
+from paddlespeech.t2s.training.reporter import report
+from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
+from paddlespeech.t2s.training.updaters.standard_updater import UpdaterState
+
+logging.basicConfig(
+ format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
+ datefmt='[%Y-%m-%d %H:%M:%S]')
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+class JETSUpdater(StandardUpdater):
+ def __init__(self,
+ model: Layer,
+ optimizers: Dict[str, Optimizer],
+ criterions: Dict[str, Layer],
+ schedulers: Dict[str, LRScheduler],
+ dataloader: DataLoader,
+ generator_train_start_steps: int=0,
+ discriminator_train_start_steps: int=100000,
+ lambda_adv: float=1.0,
+ lambda_mel: float=45.0,
+ lambda_feat_match: float=2.0,
+ lambda_var: float=1.0,
+ lambda_align: float=2.0,
+ generator_first: bool=False,
+ use_alignment_module: bool=False,
+ output_dir=None):
+ # it is designed to hold multiple models
+        # The input is a single model and the parent class's init() is not used,
+        # so this part has to be rewritten here.
+ models = {"main": model}
+ self.models: Dict[str, Layer] = models
+ # self.model = model
+
+ self.model = model._layers if isinstance(model,
+ paddle.DataParallel) else model
+
+ self.optimizers = optimizers
+ self.optimizer_g: Optimizer = optimizers['generator']
+ self.optimizer_d: Optimizer = optimizers['discriminator']
+
+ self.criterions = criterions
+ self.criterion_mel = criterions['mel']
+ self.criterion_feat_match = criterions['feat_match']
+ self.criterion_gen_adv = criterions["gen_adv"]
+ self.criterion_dis_adv = criterions["dis_adv"]
+ self.criterion_var = criterions["var"]
+ self.criterion_forwardsum = criterions["forwardsum"]
+
+ self.schedulers = schedulers
+ self.scheduler_g = schedulers['generator']
+ self.scheduler_d = schedulers['discriminator']
+
+ self.dataloader = dataloader
+
+ self.generator_train_start_steps = generator_train_start_steps
+ self.discriminator_train_start_steps = discriminator_train_start_steps
+
+ self.lambda_adv = lambda_adv
+ self.lambda_mel = lambda_mel
+ self.lambda_feat_match = lambda_feat_match
+ self.lambda_var = lambda_var
+ self.lambda_align = lambda_align
+
+ self.use_alignment_module = use_alignment_module
+
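+        # both sub-updates (generator / discriminator) run every step; generator_first only controls their order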
+ if generator_first:
+ self.turns = ["generator", "discriminator"]
+ else:
+ self.turns = ["discriminator", "generator"]
+
+ self.state = UpdaterState(iteration=0, epoch=0)
+ self.train_iterator = iter(self.dataloader)
+
+ log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
+ self.filehandler = logging.FileHandler(str(log_file))
+ logger.addHandler(self.filehandler)
+ self.logger = logger
+ self.msg = ""
+
+ def update_core(self, batch):
+ self.msg = "Rank: {}, ".format(dist.get_rank())
+ losses_dict = {}
+
+ for turn in self.turns:
+ speech = batch["speech"]
+ speech = speech.unsqueeze(1)
+ text_lengths = batch["text_lengths"]
+ feats_lengths = batch["feats_lengths"]
+ outs = self.model(
+ text=batch["text"],
+ text_lengths=batch["text_lengths"],
+ feats=batch["feats"],
+ feats_lengths=batch["feats_lengths"],
+ durations=batch["durations"],
+ durations_lengths=batch["durations_lengths"],
+ pitch=batch["pitch"],
+ energy=batch["energy"],
+ sids=batch.get("spk_id", None),
+ spembs=batch.get("spk_emb", None),
+ forward_generator=turn == "generator",
+ use_alignment_module=self.use_alignment_module)
+ # Generator
+ if turn == "generator":
+ # parse outputs
+ speech_hat_, bin_loss, log_p_attn, start_idxs, d_outs, ds, p_outs, ps, e_outs, es = outs
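+                # crop the target waveform to the same segments produced by the generator (frame indices scaled to samples by the upsample factor)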
+ speech_ = get_segments(
+ x=speech,
+ start_idxs=start_idxs *
+ self.model.generator.upsample_factor,
+ segment_size=self.model.generator.segment_size *
+ self.model.generator.upsample_factor, )
+
+ # calculate discriminator outputs
+ p_hat = self.model.discriminator(speech_hat_)
+ with paddle.no_grad():
+ # do not store discriminator gradient in generator turn
+ p = self.model.discriminator(speech_)
+
+ # calculate losses
+ mel_loss = self.criterion_mel(speech_hat_, speech_)
+
+ adv_loss = self.criterion_gen_adv(p_hat)
+ feat_match_loss = self.criterion_feat_match(p_hat, p)
+ dur_loss, pitch_loss, energy_loss = self.criterion_var(
+ d_outs, ds, p_outs, ps, e_outs, es, text_lengths)
+
+ mel_loss = mel_loss * self.lambda_mel
+ adv_loss = adv_loss * self.lambda_adv
+ feat_match_loss = feat_match_loss * self.lambda_feat_match
+ g_loss = mel_loss + adv_loss + feat_match_loss
+ var_loss = (
+ dur_loss + pitch_loss + energy_loss) * self.lambda_var
+
+ gen_loss = g_loss + var_loss #+ align_loss
+
+ report("train/generator_loss", float(gen_loss))
+ report("train/generator_generator_loss", float(g_loss))
+ report("train/generator_variance_loss", float(var_loss))
+ report("train/generator_generator_mel_loss", float(mel_loss))
+ report("train/generator_generator_adv_loss", float(adv_loss))
+ report("train/generator_generator_feat_match_loss",
+ float(feat_match_loss))
+ report("train/generator_variance_dur_loss", float(dur_loss))
+ report("train/generator_variance_pitch_loss", float(pitch_loss))
+ report("train/generator_variance_energy_loss",
+ float(energy_loss))
+
+ losses_dict["generator_loss"] = float(gen_loss)
+ losses_dict["generator_generator_loss"] = float(g_loss)
+ losses_dict["generator_variance_loss"] = float(var_loss)
+ losses_dict["generator_generator_mel_loss"] = float(mel_loss)
+ losses_dict["generator_generator_adv_loss"] = float(adv_loss)
+ losses_dict["generator_generator_feat_match_loss"] = float(
+ feat_match_loss)
+ losses_dict["generator_variance_dur_loss"] = float(dur_loss)
+ losses_dict["generator_variance_pitch_loss"] = float(pitch_loss)
+ losses_dict["generator_variance_energy_loss"] = float(
+ energy_loss)
+
+                if self.use_alignment_module:
+ forwardsum_loss = self.criterion_forwardsum(
+ log_p_attn, text_lengths, feats_lengths)
+ align_loss = (
+ forwardsum_loss + bin_loss) * self.lambda_align
+ report("train/generator_alignment_loss", float(align_loss))
+ report("train/generator_alignment_forwardsum_loss",
+ float(forwardsum_loss))
+ report("train/generator_alignment_bin_loss",
+ float(bin_loss))
+ losses_dict["generator_alignment_loss"] = float(align_loss)
+ losses_dict["generator_alignment_forwardsum_loss"] = float(
+ forwardsum_loss)
+ losses_dict["generator_alignment_bin_loss"] = float(
+ bin_loss)
+
+ self.optimizer_g.clear_grad()
+ gen_loss.backward()
+
+ self.optimizer_g.step()
+ self.scheduler_g.step()
+
+ # reset cache
+ if self.model.reuse_cache_gen or not self.model.training:
+ self.model._cache = None
+
+            # Discriminator
+ elif turn == "discriminator":
+ # parse outputs
+ speech_hat_, _, _, start_idxs, *_ = outs
+ speech_ = get_segments(
+ x=speech,
+ start_idxs=start_idxs *
+ self.model.generator.upsample_factor,
+ segment_size=self.model.generator.segment_size *
+ self.model.generator.upsample_factor, )
+
+ # calculate discriminator outputs
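+                # detach the generated waveform so no gradient flows back into the generator on this turn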
+ p_hat = self.model.discriminator(speech_hat_.detach())
+ p = self.model.discriminator(speech_)
+
+ # calculate losses
+ real_loss, fake_loss = self.criterion_dis_adv(p_hat, p)
+ dis_loss = real_loss + fake_loss
+
+ report("train/real_loss", float(real_loss))
+ report("train/fake_loss", float(fake_loss))
+ report("train/discriminator_loss", float(dis_loss))
+ losses_dict["real_loss"] = float(real_loss)
+ losses_dict["fake_loss"] = float(fake_loss)
+ losses_dict["discriminator_loss"] = float(dis_loss)
+
+ self.optimizer_d.clear_grad()
+ dis_loss.backward()
+
+ self.optimizer_d.step()
+ self.scheduler_d.step()
+
+ # reset cache
+ if self.model.reuse_cache_dis or not self.model.training:
+ self.model._cache = None
+
+ self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
+ for k, v in losses_dict.items())
+
+
+class JETSEvaluator(StandardEvaluator):
+ def __init__(self,
+ model,
+ criterions: Dict[str, Layer],
+ dataloader: DataLoader,
+ lambda_adv: float=1.0,
+ lambda_mel: float=45.0,
+ lambda_feat_match: float=2.0,
+ lambda_var: float=1.0,
+ lambda_align: float=2.0,
+ generator_first: bool=False,
+ use_alignment_module: bool=False,
+ output_dir=None):
+        # a single model is passed in but the parent class's init() is not used, so this part is re-implemented here
+ models = {"main": model}
+ self.models: Dict[str, Layer] = models
+ # self.model = model
+ self.model = model._layers if isinstance(model,
+ paddle.DataParallel) else model
+
+ self.criterions = criterions
+ self.criterion_mel = criterions['mel']
+ self.criterion_feat_match = criterions['feat_match']
+ self.criterion_gen_adv = criterions["gen_adv"]
+ self.criterion_dis_adv = criterions["dis_adv"]
+ self.criterion_var = criterions["var"]
+ self.criterion_forwardsum = criterions["forwardsum"]
+
+ self.dataloader = dataloader
+
+ self.lambda_adv = lambda_adv
+ self.lambda_mel = lambda_mel
+ self.lambda_feat_match = lambda_feat_match
+ self.lambda_var = lambda_var
+ self.lambda_align = lambda_align
+ self.use_alignment_module = use_alignment_module
+
+ if generator_first:
+ self.turns = ["generator", "discriminator"]
+ else:
+ self.turns = ["discriminator", "generator"]
+
+ log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
+ self.filehandler = logging.FileHandler(str(log_file))
+ logger.addHandler(self.filehandler)
+ self.logger = logger
+ self.msg = ""
+
+ def evaluate_core(self, batch):
+ # logging.debug("Evaluate: ")
+ self.msg = "Evaluate: "
+ losses_dict = {}
+
+ for turn in self.turns:
+ speech = batch["speech"]
+ speech = speech.unsqueeze(1)
+ text_lengths = batch["text_lengths"]
+ feats_lengths = batch["feats_lengths"]
+ outs = self.model(
+ text=batch["text"],
+ text_lengths=batch["text_lengths"],
+ feats=batch["feats"],
+ feats_lengths=batch["feats_lengths"],
+ durations=batch["durations"],
+ durations_lengths=batch["durations_lengths"],
+ pitch=batch["pitch"],
+ energy=batch["energy"],
+ sids=batch.get("spk_id", None),
+ spembs=batch.get("spk_emb", None),
+ forward_generator=turn == "generator",
+ use_alignment_module=self.use_alignment_module)
+ # Generator
+ if turn == "generator":
+ # parse outputs
+ speech_hat_, bin_loss, log_p_attn, start_idxs, d_outs, ds, p_outs, ps, e_outs, es = outs
+ speech_ = get_segments(
+ x=speech,
+ start_idxs=start_idxs *
+ self.model.generator.upsample_factor,
+ segment_size=self.model.generator.segment_size *
+ self.model.generator.upsample_factor, )
+
+ # calculate discriminator outputs
+ p_hat = self.model.discriminator(speech_hat_)
+ with paddle.no_grad():
+ # do not store discriminator gradient in generator turn
+ p = self.model.discriminator(speech_)
+
+ # calculate losses
+ mel_loss = self.criterion_mel(speech_hat_, speech_)
+
+ adv_loss = self.criterion_gen_adv(p_hat)
+ feat_match_loss = self.criterion_feat_match(p_hat, p)
+ dur_loss, pitch_loss, energy_loss = self.criterion_var(
+ d_outs, ds, p_outs, ps, e_outs, es, text_lengths)
+
+ mel_loss = mel_loss * self.lambda_mel
+ adv_loss = adv_loss * self.lambda_adv
+ feat_match_loss = feat_match_loss * self.lambda_feat_match
+ g_loss = mel_loss + adv_loss + feat_match_loss
+ var_loss = (
+ dur_loss + pitch_loss + energy_loss) * self.lambda_var
+
+ gen_loss = g_loss + var_loss #+ align_loss
+
+ report("eval/generator_loss", float(gen_loss))
+ report("eval/generator_generator_loss", float(g_loss))
+ report("eval/generator_variance_loss", float(var_loss))
+ report("eval/generator_generator_mel_loss", float(mel_loss))
+ report("eval/generator_generator_adv_loss", float(adv_loss))
+ report("eval/generator_generator_feat_match_loss",
+ float(feat_match_loss))
+ report("eval/generator_variance_dur_loss", float(dur_loss))
+ report("eval/generator_variance_pitch_loss", float(pitch_loss))
+ report("eval/generator_variance_energy_loss",
+ float(energy_loss))
+
+ losses_dict["generator_loss"] = float(gen_loss)
+ losses_dict["generator_generator_loss"] = float(g_loss)
+ losses_dict["generator_variance_loss"] = float(var_loss)
+ losses_dict["generator_generator_mel_loss"] = float(mel_loss)
+ losses_dict["generator_generator_adv_loss"] = float(adv_loss)
+ losses_dict["generator_generator_feat_match_loss"] = float(
+ feat_match_loss)
+ losses_dict["generator_variance_dur_loss"] = float(dur_loss)
+ losses_dict["generator_variance_pitch_loss"] = float(pitch_loss)
+ losses_dict["generator_variance_energy_loss"] = float(
+ energy_loss)
+
+                if self.use_alignment_module:
+ forwardsum_loss = self.criterion_forwardsum(
+ log_p_attn, text_lengths, feats_lengths)
+ align_loss = (
+ forwardsum_loss + bin_loss) * self.lambda_align
+ report("eval/generator_alignment_loss", float(align_loss))
+ report("eval/generator_alignment_forwardsum_loss",
+ float(forwardsum_loss))
+ report("eval/generator_alignment_bin_loss", float(bin_loss))
+ losses_dict["generator_alignment_loss"] = float(align_loss)
+ losses_dict["generator_alignment_forwardsum_loss"] = float(
+ forwardsum_loss)
+ losses_dict["generator_alignment_bin_loss"] = float(
+ bin_loss)
+
+ # reset cache
+ if self.model.reuse_cache_gen or not self.model.training:
+ self.model._cache = None
+
+            # Discriminator
+ elif turn == "discriminator":
+ # parse outputs
+ speech_hat_, _, _, start_idxs, *_ = outs
+ speech_ = get_segments(
+ x=speech,
+ start_idxs=start_idxs *
+ self.model.generator.upsample_factor,
+ segment_size=self.model.generator.segment_size *
+ self.model.generator.upsample_factor, )
+
+ # calculate discriminator outputs
+ p_hat = self.model.discriminator(speech_hat_.detach())
+ p = self.model.discriminator(speech_)
+
+ # calculate losses
+ real_loss, fake_loss = self.criterion_dis_adv(p_hat, p)
+ dis_loss = real_loss + fake_loss
+
+ report("eval/real_loss", float(real_loss))
+ report("eval/fake_loss", float(fake_loss))
+ report("eval/discriminator_loss", float(dis_loss))
+ losses_dict["real_loss"] = float(real_loss)
+ losses_dict["fake_loss"] = float(fake_loss)
+ losses_dict["discriminator_loss"] = float(dis_loss)
+
+ # reset cache
+ if self.model.reuse_cache_dis or not self.model.training:
+ self.model._cache = None
+
+ self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
+ for k, v in losses_dict.items())
+ self.logger.info(self.msg)
diff --git a/paddlespeech/t2s/models/jets/length_regulator.py b/paddlespeech/t2s/models/jets/length_regulator.py
new file mode 100644
index 00000000..f7a395a6
--- /dev/null
+++ b/paddlespeech/t2s/models/jets/length_regulator.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Generator module in JETS.
+
+This code is based on https://github.com/imdanboy/jets.
+
+"""
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+
+from paddlespeech.t2s.modules.masked_fill import masked_fill
+
+
+class GaussianUpsampling(nn.Layer):
+ """
+ Gaussian upsampling with fixed temperature as in:
+ https://arxiv.org/abs/2010.04301
+ """
+
+ def __init__(self, delta=0.1):
+ super().__init__()
+ self.delta = delta
+
+ def forward(self, hs, ds, h_masks=None, d_masks=None):
+ """
+ Args:
+ hs (Tensor): Batched hidden state to be expanded (B, T_text, adim)
+ ds (Tensor): Batched token duration (B, T_text)
+ h_masks (Tensor): Mask tensor (B,T_feats)
+ d_masks (Tensor): Mask tensor (B,T_text)
+ Returns:
+ Tensor: Expanded hidden state (B, T_feat, adim)
+ """
+ B = ds.shape[0]
+
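+        # number of output frames: from the feature mask if given, otherwise from the summed durations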
+ if h_masks is None:
+ T_feats = paddle.to_tensor(ds.sum(), dtype="int32")
+ else:
+ T_feats = h_masks.shape[-1]
+ t = paddle.to_tensor(
+ paddle.arange(0, T_feats).unsqueeze(0).tile([B, 1]),
+ dtype="float32")
+ if h_masks is not None:
+ t = t * paddle.to_tensor(h_masks, dtype="float32")
+
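+        # c: expected center frame position of each input token, from cumulative durations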
+ c = ds.cumsum(axis=-1) - ds / 2
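+        # Gaussian energy between each output frame t and each token center c, with fixed temperature delta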
+ energy = -1 * self.delta * (t.unsqueeze(-1) - c.unsqueeze(1))**2
+ if d_masks is not None:
+ d_masks = ~(d_masks.unsqueeze(1))
+ d_masks.stop_gradient = True
+ d_masks = d_masks.tile([1, T_feats, 1])
+ energy = masked_fill(energy, d_masks, -float("inf"))
+ p_attn = F.softmax(energy, axis=2) # (B, T_feats, T_text)
+ hs = paddle.matmul(p_attn, hs)
+ return hs
diff --git a/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/layers.py b/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/layers.py
index 71b9753c..5901c805 100644
--- a/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/layers.py
+++ b/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/layers.py
@@ -11,8 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import random
-
import paddle
import paddle.nn.functional as F
import paddleaudio.functional as audio_F
@@ -46,7 +44,8 @@ class LinearNorm(nn.Layer):
self.linear_layer.weight, gain=_calculate_gain(w_init_gain))
def forward(self, x: paddle.Tensor):
- return self.linear_layer(x)
+ out = self.linear_layer(x)
+ return out
class ConvNorm(nn.Layer):
@@ -82,85 +81,6 @@ class ConvNorm(nn.Layer):
return conv_signal
-class CausualConv(nn.Layer):
- def __init__(self,
- in_channels: int,
- out_channels: int,
- kernel_size: int=1,
- stride: int=1,
- padding: int=1,
- dilation: int=1,
- bias: bool=True,
- w_init_gain: str='linear',
- param=None):
- super().__init__()
- if padding is None:
- assert (kernel_size % 2 == 1)
- padding = int(dilation * (kernel_size - 1) / 2) * 2
- else:
- self.padding = padding * 2
- self.conv = nn.Conv1D(
- in_channels,
- out_channels,
- kernel_size=kernel_size,
- stride=stride,
- padding=self.padding,
- dilation=dilation,
- bias_attr=bias)
-
- xavier_uniform_(
- self.conv.weight, gain=_calculate_gain(w_init_gain, param=param))
-
- def forward(self, x: paddle.Tensor):
- x = self.conv(x)
- x = x[:, :, :-self.padding]
- return x
-
-
-class CausualBlock(nn.Layer):
- def __init__(self,
- hidden_dim: int,
- n_conv: int=3,
- dropout_p: float=0.2,
- activ: str='lrelu'):
- super().__init__()
- self.blocks = nn.LayerList([
- self._get_conv(
- hidden_dim=hidden_dim,
- dilation=3**i,
- activ=activ,
- dropout_p=dropout_p) for i in range(n_conv)
- ])
-
- def forward(self, x):
- for block in self.blocks:
- res = x
- x = block(x)
- x += res
- return x
-
- def _get_conv(self,
- hidden_dim: int,
- dilation: int,
- activ: str='lrelu',
- dropout_p: float=0.2):
- layers = [
- CausualConv(
- in_channels=hidden_dim,
- out_channels=hidden_dim,
- kernel_size=3,
- padding=dilation,
- dilation=dilation), _get_activation_fn(activ),
- nn.BatchNorm1D(hidden_dim), nn.Dropout(p=dropout_p), CausualConv(
- in_channels=hidden_dim,
- out_channels=hidden_dim,
- kernel_size=3,
- padding=1,
- dilation=1), _get_activation_fn(activ), nn.Dropout(p=dropout_p)
- ]
- return nn.Sequential(*layers)
-
-
class ConvBlock(nn.Layer):
def __init__(self,
hidden_dim: int,
@@ -264,13 +184,14 @@ class Attention(nn.Layer):
"""
Args:
query:
- decoder output (batch, n_mel_channels * n_frames_per_step)
+ decoder output (B, n_mel_channels * n_frames_per_step)
processed_memory:
processed encoder outputs (B, T_in, attention_dim)
attention_weights_cat:
cumulative and prev. att weights (B, 2, max_time)
Returns:
- Tensor: alignment (batch, max_time)
+ Tensor:
+ alignment (B, max_time)
"""
processed_query = self.query_layer(query.unsqueeze(1))
@@ -316,144 +237,6 @@ class Attention(nn.Layer):
return attention_context, attention_weights
-class ForwardAttentionV2(nn.Layer):
- def __init__(self,
- attention_rnn_dim: int,
- embedding_dim: int,
- attention_dim: int,
- attention_location_n_filters: int,
- attention_location_kernel_size: int):
- super().__init__()
- self.query_layer = LinearNorm(
- in_dim=attention_rnn_dim,
- out_dim=attention_dim,
- bias=False,
- w_init_gain='tanh')
- self.memory_layer = LinearNorm(
- in_dim=embedding_dim,
- out_dim=attention_dim,
- bias=False,
- w_init_gain='tanh')
- self.v = LinearNorm(in_dim=attention_dim, out_dim=1, bias=False)
- self.location_layer = LocationLayer(
- attention_n_filters=attention_location_n_filters,
- attention_kernel_size=attention_location_kernel_size,
- attention_dim=attention_dim)
- self.score_mask_value = -float(1e20)
-
- def get_alignment_energies(self,
- query: paddle.Tensor,
- processed_memory: paddle.Tensor,
- attention_weights_cat: paddle.Tensor):
- """
- Args:
- query:
- decoder output (batch, n_mel_channels * n_frames_per_step)
- processed_memory:
- processed encoder outputs (B, T_in, attention_dim)
- attention_weights_cat:
- prev. and cumulative att weights (B, 2, max_time)
- Returns:
- Tensor: alignment (batch, max_time)
- """
-
- processed_query = self.query_layer(query.unsqueeze(1))
- processed_attention_weights = self.location_layer(attention_weights_cat)
- energies = self.v(
- paddle.tanh(processed_query + processed_attention_weights +
- processed_memory))
-
- energies = energies.squeeze(-1)
- return energies
-
- def forward(self,
- attention_hidden_state: paddle.Tensor,
- memory: paddle.Tensor,
- processed_memory: paddle.Tensor,
- attention_weights_cat: paddle.Tensor,
- mask: paddle.Tensor,
- log_alpha: paddle.Tensor):
- """
- Args:
- attention_hidden_state:
- attention rnn last output
- memory:
- encoder outputs
- processed_memory:
- processed encoder outputs
- attention_weights_cat:
- previous and cummulative attention weights
- mask:
- binary mask for padded data
- """
- log_energy = self.get_alignment_energies(
- query=attention_hidden_state,
- processed_memory=processed_memory,
- attention_weights_cat=attention_weights_cat)
-
- if mask is not None:
- log_energy[:] = paddle.where(
- mask,
- paddle.full(log_energy.shape, self.score_mask_value,
- log_energy.dtype), log_energy)
- log_alpha_shift_padded = []
- max_time = log_energy.shape[1]
- for sft in range(2):
- shifted = log_alpha[:, :max_time - sft]
- shift_padded = F.pad(shifted, (sft, 0), 'constant',
- self.score_mask_value)
- log_alpha_shift_padded.append(shift_padded.unsqueeze(2))
-
- biased = paddle.logsumexp(paddle.conat(log_alpha_shift_padded, 2), 2)
- log_alpha_new = biased + log_energy
- attention_weights = F.softmax(log_alpha_new, axis=1)
- attention_context = paddle.bmm(attention_weights.unsqueeze(1), memory)
- attention_context = attention_context.squeeze(1)
-
- return attention_context, attention_weights, log_alpha_new
-
-
-class PhaseShuffle2D(nn.Layer):
- def __init__(self, n: int=2):
- super().__init__()
- self.n = n
- self.random = random.Random(1)
-
- def forward(self, x: paddle.Tensor, move: int=None):
- # x.size = (B, C, M, L)
- if move is None:
- move = self.random.randint(-self.n, self.n)
-
- if move == 0:
- return x
- else:
- left = x[:, :, :, :move]
- right = x[:, :, :, move:]
- shuffled = paddle.concat([right, left], axis=3)
- return shuffled
-
-
-class PhaseShuffle1D(nn.Layer):
- def __init__(self, n: int=2):
- super().__init__()
- self.n = n
- self.random = random.Random(1)
-
- def forward(self, x: paddle.Tensor, move: int=None):
- # x.size = (B, C, M, L)
- if move is None:
- move = self.random.randint(-self.n, self.n)
-
- if move == 0:
- return x
- else:
- left = x[:, :, :move]
- right = x[:, :, move:]
- shuffled = paddle.concat([right, left], axis=2)
-
- return shuffled
-
-
class MFCC(nn.Layer):
def __init__(self, n_mfcc: int=40, n_mels: int=80):
super().__init__()
@@ -473,7 +256,6 @@ class MFCC(nn.Layer):
# -> (channel, time, n_mfcc).tranpose(...)
mfcc = paddle.matmul(mel_specgram.transpose([0, 2, 1]),
self.dct_mat).transpose([0, 2, 1])
-
# unpack batch
if unsqueezed:
mfcc = mfcc.squeeze(0)
diff --git a/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/model.py b/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/model.py
index 48de8af1..85b3453d 100644
--- a/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/model.py
+++ b/paddlespeech/t2s/models/starganv2_vc/AuxiliaryASR/model.py
@@ -22,6 +22,7 @@ from .layers import ConvBlock
from .layers import ConvNorm
from .layers import LinearNorm
from .layers import MFCC
+from paddlespeech.t2s.modules.nets_utils import _reset_parameters
from paddlespeech.utils.initialize import uniform_
@@ -59,6 +60,9 @@ class ASRCNN(nn.Layer):
hidden_dim=hidden_dim // 2,
n_token=n_token)
+ self.reset_parameters()
+ self.asr_s2s.reset_parameters()
+
def forward(self,
x: paddle.Tensor,
src_key_padding_mask: paddle.Tensor=None,
@@ -99,7 +103,7 @@ class ASRCNN(nn.Layer):
unmask_futre_steps (int):
unmasking future step size.
Return:
- mask (paddle.BoolTensor):
+ Tensor (paddle.Tensor(bool)):
mask future timesteps mask[i, j] = True if i > j + unmask_future_steps else False
"""
index_tensor = paddle.arange(out_length).unsqueeze(0).expand(
@@ -108,6 +112,9 @@ class ASRCNN(nn.Layer):
index_tensor.T + unmask_future_steps)
return mask
+ def reset_parameters(self):
+ self.apply(_reset_parameters)
+
class ASRS2S(nn.Layer):
def __init__(self,
@@ -118,8 +125,7 @@ class ASRS2S(nn.Layer):
n_token: int=40):
super().__init__()
self.embedding = nn.Embedding(n_token, embedding_dim)
- val_range = math.sqrt(6 / hidden_dim)
- uniform_(self.embedding.weight, -val_range, val_range)
+ self.val_range = math.sqrt(6 / hidden_dim)
self.decoder_rnn_dim = hidden_dim
self.project_to_n_symbols = nn.Linear(self.decoder_rnn_dim, n_token)
@@ -194,9 +200,8 @@ class ASRS2S(nn.Layer):
logit_outputs += [logit]
alignments += [attention_weights]
- hidden_outputs, logit_outputs, alignments = \
- self.parse_decoder_outputs(
- hidden_outputs, logit_outputs, alignments)
+ hidden_outputs, logit_outputs, alignments = self.parse_decoder_outputs(
+ hidden_outputs, logit_outputs, alignments)
return hidden_outputs, logit_outputs, alignments
@@ -237,3 +242,6 @@ class ASRS2S(nn.Layer):
hidden = paddle.stack(hidden).transpose([1, 0, 2])
return hidden, logit, alignments
+
+ def reset_parameters(self):
+ uniform_(self.embedding.weight, -self.val_range, self.val_range)
diff --git a/paddlespeech/t2s/models/starganv2_vc/JDCNet/model.py b/paddlespeech/t2s/models/starganv2_vc/JDCNet/model.py
index 118b8f0e..5938e6a7 100644
--- a/paddlespeech/t2s/models/starganv2_vc/JDCNet/model.py
+++ b/paddlespeech/t2s/models/starganv2_vc/JDCNet/model.py
@@ -33,10 +33,9 @@ class JDCNet(nn.Layer):
super().__init__()
self.seq_len = seq_len
self.num_class = num_class
-
- # input = (b, 1, 31, 513), b = batch size
+ # input: (B, num_class, T, n_mels)
self.conv_block = nn.Sequential(
- # out: (b, 64, 31, 513)
+ # output: (B, out_channels, T, n_mels)
nn.Conv2D(
in_channels=1,
out_channels=64,
@@ -45,127 +44,99 @@ class JDCNet(nn.Layer):
bias_attr=False),
nn.BatchNorm2D(num_features=64),
nn.LeakyReLU(leaky_relu_slope),
- # (b, 64, 31, 513)
+ # out: (B, out_channels, T, n_mels)
nn.Conv2D(64, 64, 3, padding=1, bias_attr=False), )
-
- # res blocks
- # (b, 128, 31, 128)
+ # output: (B, out_channels, T, n_mels // 2)
self.res_block1 = ResBlock(in_channels=64, out_channels=128)
- # (b, 192, 31, 32)
+ # output: (B, out_channels, T, n_mels // 4)
self.res_block2 = ResBlock(in_channels=128, out_channels=192)
- # (b, 256, 31, 8)
+ # output: (B, out_channels, T, n_mels // 8)
self.res_block3 = ResBlock(in_channels=192, out_channels=256)
-
# pool block
self.pool_block = nn.Sequential(
nn.BatchNorm2D(num_features=256),
nn.LeakyReLU(leaky_relu_slope),
- # (b, 256, 31, 2)
+ # (B, num_features, T, 2)
nn.MaxPool2D(kernel_size=(1, 4)),
nn.Dropout(p=0.5), )
-
- # maxpool layers (for auxiliary network inputs)
- # in = (b, 128, 31, 513) from conv_block, out = (b, 128, 31, 2)
- self.maxpool1 = nn.MaxPool2D(kernel_size=(1, 40))
- # in = (b, 128, 31, 128) from res_block1, out = (b, 128, 31, 2)
- self.maxpool2 = nn.MaxPool2D(kernel_size=(1, 20))
- # in = (b, 128, 31, 32) from res_block2, out = (b, 128, 31, 2)
- self.maxpool3 = nn.MaxPool2D(kernel_size=(1, 10))
-
- # in = (b, 640, 31, 2), out = (b, 256, 31, 2)
- self.detector_conv = nn.Sequential(
- nn.Conv2D(
- in_channels=640,
- out_channels=256,
- kernel_size=1,
- bias_attr=False),
- nn.BatchNorm2D(256),
- nn.LeakyReLU(leaky_relu_slope),
- nn.Dropout(p=0.5), )
-
- # input: (b, 31, 512) - resized from (b, 256, 31, 2)
- # output: (b, 31, 512)
+ # input: (B, T, input_size), resized from (B, input_size // 2, T, 2)
+ # output: (B, T, input_size)
self.bilstm_classifier = nn.LSTM(
input_size=512,
hidden_size=256,
time_major=False,
direction='bidirectional')
-
- # input: (b, 31, 512) - resized from (b, 256, 31, 2)
- # output: (b, 31, 512)
- self.bilstm_detector = nn.LSTM(
- input_size=512,
- hidden_size=256,
- time_major=False,
- direction='bidirectional')
-
- # input: (b * 31, 512)
- # output: (b * 31, num_class)
+ # input: (B * T, in_features)
+ # output: (B * T, num_class)
self.classifier = nn.Linear(
in_features=512, out_features=self.num_class)
- # input: (b * 31, 512)
- # output: (b * 31, 2) - binary classifier
- self.detector = nn.Linear(in_features=512, out_features=2)
-
# initialize weights
self.apply(self.init_weights)
def get_feature_GAN(self, x: paddle.Tensor):
- seq_len = x.shape[-2]
- x = x.astype(paddle.float32).transpose([0, 1, 3, 2] if len(x.shape) == 4
- else [0, 2, 1])
-
+ """Calculate feature_GAN.
+ Args:
+ x(Tensor(float32)):
+ Shape (B, num_class, n_mels, T).
+ Returns:
+ Tensor:
+ Shape (B, num_features, n_mels // 8, T).
+ """
+ x = x.astype(paddle.float32)
+ x = x.transpose([0, 1, 3, 2] if len(x.shape) == 4 else [0, 2, 1])
convblock_out = self.conv_block(x)
-
resblock1_out = self.res_block1(convblock_out)
resblock2_out = self.res_block2(resblock1_out)
resblock3_out = self.res_block3(resblock2_out)
poolblock_out = self.pool_block[0](resblock3_out)
poolblock_out = self.pool_block[1](poolblock_out)
-
- return poolblock_out.transpose([0, 1, 3, 2] if len(poolblock_out.shape)
- == 4 else [0, 2, 1])
+ GAN_feature = poolblock_out.transpose([0, 1, 3, 2] if len(
+ poolblock_out.shape) == 4 else [0, 2, 1])
+ return GAN_feature
def forward(self, x: paddle.Tensor):
- """
+ """Calculate forward propagation.
+ Args:
+ x(Tensor(float32)):
+ Shape (B, num_class, n_mels, seq_len).
Returns:
- classification_prediction, detection_prediction
- sizes: (b, 31, 722), (b, 31, 2)
+ Tensor:
+ classifier output consists of predicted pitch classes per frame.
+ Shape: (B, seq_len, num_class).
+ Tensor:
+ GAN_feature. Shape: (B, num_features, n_mels // 8, seq_len)
+ Tensor:
+ poolblock_out. Shape (B, seq_len, 512)
"""
###############################
# forward pass for classifier #
###############################
+ # (B, num_class, n_mels, T) -> (B, num_class, T, n_mels)
x = x.transpose([0, 1, 3, 2] if len(x.shape) == 4 else
[0, 2, 1]).astype(paddle.float32)
convblock_out = self.conv_block(x)
-
resblock1_out = self.res_block1(convblock_out)
resblock2_out = self.res_block2(resblock1_out)
resblock3_out = self.res_block3(resblock2_out)
-
poolblock_out = self.pool_block[0](resblock3_out)
poolblock_out = self.pool_block[1](poolblock_out)
GAN_feature = poolblock_out.transpose([0, 1, 3, 2] if len(
poolblock_out.shape) == 4 else [0, 2, 1])
poolblock_out = self.pool_block[2](poolblock_out)
-
- # (b, 256, 31, 2) => (b, 31, 256, 2) => (b, 31, 512)
+ # (B, 256, seq_len, 2) => (B, seq_len, 256, 2) => (B, seq_len, 512)
classifier_out = poolblock_out.transpose([0, 2, 1, 3]).reshape(
(-1, self.seq_len, 512))
self.bilstm_classifier.flatten_parameters()
- classifier_out, _ = self.bilstm_classifier(
- classifier_out) # ignore the hidden states
-
- classifier_out = classifier_out.reshape((-1, 512)) # (b * 31, 512)
+ # ignore the hidden states
+ classifier_out, _ = self.bilstm_classifier(classifier_out)
+ # (B * seq_len, 512)
+ classifier_out = classifier_out.reshape((-1, 512))
classifier_out = self.classifier(classifier_out)
+ # (B, seq_len, num_class)
classifier_out = classifier_out.reshape(
- (-1, self.seq_len, self.num_class)) # (b, 31, num_class)
-
- # sizes: (b, 31, 722), (b, 31, 2)
- # classifier output consists of predicted pitch classes per frame
- # detector output consists of: (isvoice, notvoice) estimates per frame
+ (-1, self.seq_len, self.num_class))
return paddle.abs(classifier_out.squeeze()), GAN_feature, poolblock_out
@staticmethod
@@ -188,10 +159,9 @@ class ResBlock(nn.Layer):
def __init__(self,
in_channels: int,
out_channels: int,
- leaky_relu_slope=0.01):
+ leaky_relu_slope: float=0.01):
super().__init__()
self.downsample = in_channels != out_channels
-
# BN / LReLU / MaxPool layer before the conv layer - see Figure 1b in the paper
self.pre_conv = nn.Sequential(
nn.BatchNorm2D(num_features=in_channels),
@@ -215,7 +185,6 @@ class ResBlock(nn.Layer):
kernel_size=3,
padding=1,
bias_attr=False), )
-
# 1 x 1 convolution layer to match the feature dimensions
self.conv1by1 = None
if self.downsample:
@@ -226,6 +195,13 @@ class ResBlock(nn.Layer):
bias_attr=False)
def forward(self, x: paddle.Tensor):
+ """Calculate forward propagation.
+ Args:
+ x(Tensor(float32)): Shape (B, in_channels, T, n_mels).
+ Returns:
+ Tensor:
+ The residual output, Shape (B, out_channels, T, n_mels // 2).
+ """
x = self.pre_conv(x)
if self.downsample:
x = self.conv(x) + self.conv1by1(x)
diff --git a/paddlespeech/t2s/models/starganv2_vc/losses.py b/paddlespeech/t2s/models/starganv2_vc/losses.py
index 8086a595..d94c9342 100644
--- a/paddlespeech/t2s/models/starganv2_vc/losses.py
+++ b/paddlespeech/t2s/models/starganv2_vc/losses.py
@@ -11,92 +11,102 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from typing import Any
+from typing import Dict
+
import paddle
import paddle.nn.functional as F
-from munch import Munch
-from starganv2vc_paddle.transforms import build_transforms
+from .transforms import build_transforms
# these loss computations are all wired into the updater
-def compute_d_loss(nets,
- args,
- x_real,
- y_org,
- y_trg,
- z_trg=None,
- x_ref=None,
- use_r1_reg=True,
- use_adv_cls=False,
- use_con_reg=False):
- args = Munch(args)
+
+
+def compute_d_loss(
+ nets: Dict[str, Any],
+ x_real: paddle.Tensor,
+ y_org: paddle.Tensor,
+ y_trg: paddle.Tensor,
+ z_trg: paddle.Tensor=None,
+ x_ref: paddle.Tensor=None,
+    # TODO: this should default to True, but r1_reg currently has a bug
+ use_r1_reg: bool=False,
+ use_adv_cls: bool=False,
+ use_con_reg: bool=False,
+ lambda_reg: float=1.,
+ lambda_adv_cls: float=0.1,
+ lambda_con_reg: float=10.):
assert (z_trg is None) != (x_ref is None)
# with real audios
x_real.stop_gradient = False
- out = nets.discriminator(x_real, y_org)
+ out = nets['discriminator'](x_real, y_org)
loss_real = adv_loss(out, 1)
-
# R1 regularizaition (https://arxiv.org/abs/1801.04406v4)
if use_r1_reg:
loss_reg = r1_reg(out, x_real)
else:
- loss_reg = paddle.to_tensor([0.], dtype=paddle.float32)
+ # loss_reg = paddle.to_tensor([0.], dtype=paddle.float32)
+ loss_reg = paddle.zeros([1])
# consistency regularization (bCR-GAN: https://arxiv.org/abs/2002.04724)
- loss_con_reg = paddle.to_tensor([0.], dtype=paddle.float32)
+ loss_con_reg = paddle.zeros([1])
if use_con_reg:
t = build_transforms()
- out_aug = nets.discriminator(t(x_real).detach(), y_org)
+ out_aug = nets['discriminator'](t(x_real).detach(), y_org)
loss_con_reg += F.smooth_l1_loss(out, out_aug)
# with fake audios
with paddle.no_grad():
if z_trg is not None:
- s_trg = nets.mapping_network(z_trg, y_trg)
+ s_trg = nets['mapping_network'](z_trg, y_trg)
else: # x_ref is not None
- s_trg = nets.style_encoder(x_ref, y_trg)
+ s_trg = nets['style_encoder'](x_ref, y_trg)
- F0 = nets.f0_model.get_feature_GAN(x_real)
- x_fake = nets.generator(x_real, s_trg, masks=None, F0=F0)
- out = nets.discriminator(x_fake, y_trg)
+ F0 = nets['F0_model'].get_feature_GAN(x_real)
+ x_fake = nets['generator'](x_real, s_trg, masks=None, F0=F0)
+ out = nets['discriminator'](x_fake, y_trg)
loss_fake = adv_loss(out, 0)
if use_con_reg:
- out_aug = nets.discriminator(t(x_fake).detach(), y_trg)
+ out_aug = nets['discriminator'](t(x_fake).detach(), y_trg)
loss_con_reg += F.smooth_l1_loss(out, out_aug)
# adversarial classifier loss
if use_adv_cls:
- out_de = nets.discriminator.classifier(x_fake)
+ out_de = nets['discriminator'].classifier(x_fake)
loss_real_adv_cls = F.cross_entropy(out_de[y_org != y_trg],
y_org[y_org != y_trg])
if use_con_reg:
- out_de_aug = nets.discriminator.classifier(t(x_fake).detach())
+ out_de_aug = nets['discriminator'].classifier(t(x_fake).detach())
loss_con_reg += F.smooth_l1_loss(out_de, out_de_aug)
else:
loss_real_adv_cls = paddle.zeros([1]).mean()
- loss = loss_real + loss_fake + args.lambda_reg * loss_reg + \
- args.lambda_adv_cls * loss_real_adv_cls + \
- args.lambda_con_reg * loss_con_reg
+ loss = loss_real + loss_fake + lambda_reg * loss_reg + \
+ lambda_adv_cls * loss_real_adv_cls + \
+ lambda_con_reg * loss_con_reg
- return loss, Munch(
- real=loss_real.item(),
- fake=loss_fake.item(),
- reg=loss_reg.item(),
- real_adv_cls=loss_real_adv_cls.item(),
- con_reg=loss_con_reg.item())
+ return loss
-def compute_g_loss(nets,
- args,
- x_real,
- y_org,
- y_trg,
- z_trgs=None,
- x_refs=None,
- use_adv_cls=False):
- args = Munch(args)
+def compute_g_loss(nets: Dict[str, Any],
+ x_real: paddle.Tensor,
+ y_org: paddle.Tensor,
+ y_trg: paddle.Tensor,
+ z_trgs: paddle.Tensor=None,
+ x_refs: paddle.Tensor=None,
+ use_adv_cls: bool=False,
+ lambda_sty: float=1.,
+ lambda_cyc: float=5.,
+ lambda_ds: float=1.,
+ lambda_norm: float=1.,
+ lambda_asr: float=10.,
+ lambda_f0: float=5.,
+ lambda_f0_sty: float=0.1,
+ lambda_adv: float=2.,
+ lambda_adv_cls: float=0.5,
+ norm_bias: float=0.5):
assert (z_trgs is None) != (x_refs is None)
if z_trgs is not None:
@@ -106,37 +116,37 @@ def compute_g_loss(nets,
# compute style vectors
if z_trgs is not None:
- s_trg = nets.mapping_network(z_trg, y_trg)
+ s_trg = nets['mapping_network'](z_trg, y_trg)
else:
- s_trg = nets.style_encoder(x_ref, y_trg)
+ s_trg = nets['style_encoder'](x_ref, y_trg)
# compute ASR/F0 features (real)
- with paddle.no_grad():
- F0_real, GAN_F0_real, cyc_F0_real = nets.f0_model(x_real)
- ASR_real = nets.asr_model.get_feature(x_real)
+    # the reference implementation does not call .eval() and relies on no_grad() instead
+    # here .eval() is used, and wrapping this block in `with paddle.no_grad()` raises an error
+ F0_real, GAN_F0_real, cyc_F0_real = nets['F0_model'](x_real)
+ ASR_real = nets['asr_model'].get_feature(x_real)
# adversarial loss
- x_fake = nets.generator(x_real, s_trg, masks=None, F0=GAN_F0_real)
- out = nets.discriminator(x_fake, y_trg)
+ x_fake = nets['generator'](x_real, s_trg, masks=None, F0=GAN_F0_real)
+ out = nets['discriminator'](x_fake, y_trg)
loss_adv = adv_loss(out, 1)
# compute ASR/F0 features (fake)
- F0_fake, GAN_F0_fake, _ = nets.f0_model(x_fake)
- ASR_fake = nets.asr_model.get_feature(x_fake)
+ F0_fake, GAN_F0_fake, _ = nets['F0_model'](x_fake)
+ ASR_fake = nets['asr_model'].get_feature(x_fake)
# norm consistency loss
x_fake_norm = log_norm(x_fake)
x_real_norm = log_norm(x_real)
- loss_norm = ((
- paddle.nn.ReLU()(paddle.abs(x_fake_norm - x_real_norm) - args.norm_bias)
- )**2).mean()
+ tmp = paddle.abs(x_fake_norm - x_real_norm) - norm_bias
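+    # only log-energy deviations larger than norm_bias are penalized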
+ loss_norm = ((paddle.nn.ReLU()(tmp))**2).mean()
# F0 loss
loss_f0 = f0_loss(F0_fake, F0_real)
# style F0 loss (style initialization)
- if x_refs is not None and args.lambda_f0_sty > 0 and not use_adv_cls:
- F0_sty, _, _ = nets.f0_model(x_ref)
+ if x_refs is not None and lambda_f0_sty > 0 and not use_adv_cls:
+ F0_sty, _, _ = nets['F0_model'](x_ref)
loss_f0_sty = F.l1_loss(
compute_mean_f0(F0_fake), compute_mean_f0(F0_sty))
else:
@@ -146,61 +156,53 @@ def compute_g_loss(nets,
loss_asr = F.smooth_l1_loss(ASR_fake, ASR_real)
# style reconstruction loss
- s_pred = nets.style_encoder(x_fake, y_trg)
+ s_pred = nets['style_encoder'](x_fake, y_trg)
loss_sty = paddle.mean(paddle.abs(s_pred - s_trg))
# diversity sensitive loss
if z_trgs is not None:
- s_trg2 = nets.mapping_network(z_trg2, y_trg)
+ s_trg2 = nets['mapping_network'](z_trg2, y_trg)
else:
- s_trg2 = nets.style_encoder(x_ref2, y_trg)
- x_fake2 = nets.generator(x_real, s_trg2, masks=None, F0=GAN_F0_real)
+ s_trg2 = nets['style_encoder'](x_ref2, y_trg)
+ x_fake2 = nets['generator'](x_real, s_trg2, masks=None, F0=GAN_F0_real)
x_fake2 = x_fake2.detach()
- _, GAN_F0_fake2, _ = nets.f0_model(x_fake2)
+ _, GAN_F0_fake2, _ = nets['F0_model'](x_fake2)
loss_ds = paddle.mean(paddle.abs(x_fake - x_fake2))
loss_ds += F.smooth_l1_loss(GAN_F0_fake, GAN_F0_fake2.detach())
# cycle-consistency loss
- s_org = nets.style_encoder(x_real, y_org)
- x_rec = nets.generator(x_fake, s_org, masks=None, F0=GAN_F0_fake)
+ s_org = nets['style_encoder'](x_real, y_org)
+ x_rec = nets['generator'](x_fake, s_org, masks=None, F0=GAN_F0_fake)
loss_cyc = paddle.mean(paddle.abs(x_rec - x_real))
# F0 loss in cycle-consistency loss
- if args.lambda_f0 > 0:
- _, _, cyc_F0_rec = nets.f0_model(x_rec)
+ if lambda_f0 > 0:
+ _, _, cyc_F0_rec = nets['F0_model'](x_rec)
loss_cyc += F.smooth_l1_loss(cyc_F0_rec, cyc_F0_real)
- if args.lambda_asr > 0:
- ASR_recon = nets.asr_model.get_feature(x_rec)
+ if lambda_asr > 0:
+ ASR_recon = nets['asr_model'].get_feature(x_rec)
loss_cyc += F.smooth_l1_loss(ASR_recon, ASR_real)
# adversarial classifier loss
if use_adv_cls:
- out_de = nets.discriminator.classifier(x_fake)
+ out_de = nets['discriminator'].classifier(x_fake)
loss_adv_cls = F.cross_entropy(out_de[y_org != y_trg],
y_trg[y_org != y_trg])
else:
loss_adv_cls = paddle.zeros([1]).mean()
- loss = args.lambda_adv * loss_adv + args.lambda_sty * loss_sty \
- - args.lambda_ds * loss_ds + args.lambda_cyc * loss_cyc\
- + args.lambda_norm * loss_norm \
- + args.lambda_asr * loss_asr \
- + args.lambda_f0 * loss_f0 \
- + args.lambda_f0_sty * loss_f0_sty \
- + args.lambda_adv_cls * loss_adv_cls
-
- return loss, Munch(
- adv=loss_adv.item(),
- sty=loss_sty.item(),
- ds=loss_ds.item(),
- cyc=loss_cyc.item(),
- norm=loss_norm.item(),
- asr=loss_asr.item(),
- f0=loss_f0.item(),
- adv_cls=loss_adv_cls.item())
+ loss = lambda_adv * loss_adv + lambda_sty * loss_sty \
+ - lambda_ds * loss_ds + lambda_cyc * loss_cyc \
+ + lambda_norm * loss_norm \
+ + lambda_asr * loss_asr \
+ + lambda_f0 * loss_f0 \
+ + lambda_f0_sty * loss_f0_sty \
+ + lambda_adv_cls * loss_adv_cls
+
+ return loss
# for norm consistency loss
-def log_norm(x, mean=-4, std=4, axis=2):
+def log_norm(x: paddle.Tensor, mean: float=-4, std: float=4, axis: int=2):
"""
normalized log mel -> mel -> norm -> log(norm)
"""
@@ -209,7 +211,7 @@ def log_norm(x, mean=-4, std=4, axis=2):
# for adversarial loss
-def adv_loss(logits, target):
+def adv_loss(logits: paddle.Tensor, target: float):
assert target in [1, 0]
if len(logits.shape) > 1:
logits = logits.reshape([-1])
@@ -220,7 +222,7 @@ def adv_loss(logits, target):
# for R1 regularization loss
-def r1_reg(d_out, x_in):
+def r1_reg(d_out: paddle.Tensor, x_in: paddle.Tensor):
# zero-centered gradient penalty for real images
batch_size = x_in.shape[0]
grad_dout = paddle.grad(
@@ -236,14 +238,14 @@ def r1_reg(d_out, x_in):
# for F0 consistency loss
-def compute_mean_f0(f0):
+def compute_mean_f0(f0: paddle.Tensor):
f0_mean = f0.mean(-1)
f0_mean = f0_mean.expand((f0.shape[-1], f0_mean.shape[0])).transpose(
(1, 0)) # (B, M)
return f0_mean
-def f0_loss(x_f0, y_f0):
+def f0_loss(x_f0: paddle.Tensor, y_f0: paddle.Tensor):
"""
x.shape = (B, 1, M, L): predict
y.shape = (B, 1, M, L): target
diff --git a/paddlespeech/t2s/models/starganv2_vc/starganv2_vc.py b/paddlespeech/t2s/models/starganv2_vc/starganv2_vc.py
index 96e9eda8..99aeb73b 100644
--- a/paddlespeech/t2s/models/starganv2_vc/starganv2_vc.py
+++ b/paddlespeech/t2s/models/starganv2_vc/starganv2_vc.py
@@ -19,17 +19,13 @@ This work is licensed under the Creative Commons Attribution-NonCommercial
http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to
Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
"""
-# import copy
import math
import paddle
import paddle.nn.functional as F
from paddle import nn
-from paddlespeech.utils.initialize import _calculate_gain
-from paddlespeech.utils.initialize import xavier_uniform_
-
-# from munch import Munch
+from paddlespeech.t2s.modules.nets_utils import _reset_parameters
class DownSample(nn.Layer):
@@ -37,13 +33,24 @@ class DownSample(nn.Layer):
super().__init__()
self.layer_type = layer_type
- def forward(self, x):
+ def forward(self, x: paddle.Tensor):
+ """Calculate forward propagation.
+ Args:
+ x(Tensor(float32)): Shape (B, dim_in, n_mels, T).
+ Returns:
+ Tensor:
+ layer_type == 'none': Shape (B, dim_in, n_mels, T)
+ layer_type == 'timepreserve': Shape (B, dim_in, n_mels // 2, T)
+ layer_type == 'half': Shape (B, dim_in, n_mels // 2, T // 2)
+ """
if self.layer_type == 'none':
return x
elif self.layer_type == 'timepreserve':
- return F.avg_pool2d(x, (2, 1))
+ out = F.avg_pool2d(x, (2, 1))
+ return out
elif self.layer_type == 'half':
- return F.avg_pool2d(x, 2)
+ out = F.avg_pool2d(x, 2)
+ return out
else:
raise RuntimeError(
'Got unexpected donwsampletype %s, expected is [none, timepreserve, half]'
@@ -55,13 +62,24 @@ class UpSample(nn.Layer):
super().__init__()
self.layer_type = layer_type
- def forward(self, x):
+ def forward(self, x: paddle.Tensor):
+ """Calculate forward propagation.
+ Args:
+ x(Tensor(float32)): Shape (B, dim_in, n_mels, T).
+ Returns:
+ Tensor:
+ layer_type == 'none': Shape (B, dim_in, n_mels, T)
+ layer_type == 'timepreserve': Shape (B, dim_in, n_mels * 2, T)
+ layer_type == 'half': Shape (B, dim_in, n_mels * 2, T * 2)
+ """
if self.layer_type == 'none':
return x
elif self.layer_type == 'timepreserve':
- return F.interpolate(x, scale_factor=(2, 1), mode='nearest')
+ out = F.interpolate(x, scale_factor=(2, 1), mode='nearest')
+ return out
elif self.layer_type == 'half':
- return F.interpolate(x, scale_factor=2, mode='nearest')
+ out = F.interpolate(x, scale_factor=2, mode='nearest')
+ return out
else:
raise RuntimeError(
'Got unexpected upsampletype %s, expected is [none, timepreserve, half]'
@@ -127,9 +145,19 @@ class ResBlk(nn.Layer):
return x
def forward(self, x: paddle.Tensor):
+ """Calculate forward propagation.
+ Args:
+ x(Tensor(float32)): Shape (B, dim_in, n_mels, T).
+ Returns:
+ Tensor:
+ downsample == 'none': Shape (B, dim_in, n_mels, T).
+                downsample == 'timepreserve': Shape (B, dim_out, n_mels // 2, T).
+                downsample == 'half': Shape (B, dim_out, n_mels // 2, T // 2).
+ """
x = self._shortcut(x) + self._residual(x)
# unit variance
- return x / math.sqrt(2)
+ out = x / math.sqrt(2)
+ return out
class AdaIN(nn.Layer):
@@ -140,12 +168,21 @@ class AdaIN(nn.Layer):
self.fc = nn.Linear(style_dim, num_features * 2)
def forward(self, x: paddle.Tensor, s: paddle.Tensor):
+ """Calculate forward propagation.
+ Args:
+ x(Tensor(float32)): Shape (B, style_dim, n_mels, T).
+ s(Tensor(float32)): Shape (style_dim, ).
+ Returns:
+ Tensor:
+                Shape (B, style_dim, n_mels, T).
+ """
if len(s.shape) == 1:
s = s[None]
h = self.fc(s)
h = h.reshape((h.shape[0], h.shape[1], 1, 1))
gamma, beta = paddle.split(h, 2, axis=1)
- return (1 + gamma) * self.norm(x) + beta
+ out = (1 + gamma) * self.norm(x) + beta
+ return out
class AdainResBlk(nn.Layer):
@@ -162,6 +199,7 @@ class AdainResBlk(nn.Layer):
self.upsample = UpSample(layer_type=upsample)
self.learned_sc = dim_in != dim_out
self._build_weights(dim_in, dim_out, style_dim)
+ self.layer_type = upsample
def _build_weights(self, dim_in: int, dim_out: int, style_dim: int=64):
self.conv1 = nn.Conv2D(
@@ -204,6 +242,18 @@ class AdainResBlk(nn.Layer):
return x
def forward(self, x: paddle.Tensor, s: paddle.Tensor):
+ """Calculate forward propagation.
+ Args:
+ x(Tensor(float32)):
+ Shape (B, dim_in, n_mels, T).
+ s(Tensor(float32)):
+ Shape (64,).
+ Returns:
+ Tensor:
+                upsample == 'none': Shape (B, dim_out, n_mels, T).
+                upsample == 'timepreserve': Shape (B, dim_out, n_mels * 2, T).
+                upsample == 'half': Shape (B, dim_out, n_mels * 2, T * 2).
+ """
out = self._residual(x, s)
if self.w_hpf == 0:
out = (out + self._shortcut(x)) / math.sqrt(2)
@@ -219,7 +269,8 @@ class HighPass(nn.Layer):
def forward(self, x: paddle.Tensor):
filter = self.filter.unsqueeze(0).unsqueeze(1).tile(
[x.shape[1], 1, 1, 1])
- return F.conv2d(x, filter, padding=1, groups=x.shape[1])
+ out = F.conv2d(x, filter, padding=1, groups=x.shape[1])
+ return out
class Generator(nn.Layer):
@@ -276,12 +327,10 @@ class Generator(nn.Layer):
w_hpf=w_hpf,
upsample=_downtype)) # stack-like
dim_in = dim_out
-
# bottleneck blocks (encoder)
for _ in range(2):
self.encode.append(
ResBlk(dim_in=dim_out, dim_out=dim_out, normalize=True))
-
# F0 blocks
if F0_channel != 0:
self.decode.insert(0,
@@ -290,7 +339,6 @@ class Generator(nn.Layer):
dim_out=dim_out,
style_dim=style_dim,
w_hpf=w_hpf))
-
# bottleneck blocks (decoder)
for _ in range(2):
self.decode.insert(0,
@@ -299,7 +347,6 @@ class Generator(nn.Layer):
dim_out=dim_out + int(F0_channel / 2),
style_dim=style_dim,
w_hpf=w_hpf))
-
if F0_channel != 0:
self.F0_conv = nn.Sequential(
ResBlk(
@@ -307,35 +354,57 @@ class Generator(nn.Layer):
dim_out=int(F0_channel / 2),
normalize=True,
downsample="half"), )
-
if w_hpf > 0:
self.hpf = HighPass(w_hpf)
+ self.reset_parameters()
+
def forward(self,
x: paddle.Tensor,
s: paddle.Tensor,
masks: paddle.Tensor=None,
F0: paddle.Tensor=None):
+ """Calculate forward propagation.
+ Args:
+ x(Tensor(float32)):
+ Shape (B, 1, n_mels, T).
+ s(Tensor(float32)):
+ Shape (64,).
+ masks:
+ None.
+ F0:
+ Shape (B, num_features(256), n_mels // 8, T).
+ Returns:
+ Tensor:
+ output of generator. Shape (B, 1, n_mels, T // 4 * 4)
+ """
x = self.stem(x)
cache = {}
+ # output: (B, max_conv_dim, n_mels // 16, T // 4)
for block in self.encode:
if (masks is not None) and (x.shape[2] in [32, 64, 128]):
cache[x.shape[2]] = x
x = block(x)
-
if F0 is not None:
+ # input: (B, num_features(256), n_mels // 8, T)
+ # output: (B, num_features(256) // 2, n_mels // 16, T // 2)
F0 = self.F0_conv(F0)
+ # output: (B, num_features(256) // 2, n_mels // 16, T // 4)
F0 = F.adaptive_avg_pool2d(F0, [x.shape[-2], x.shape[-1]])
x = paddle.concat([x, F0], axis=1)
-
+ # input: (B, max_conv_dim+num_features(256) // 2, n_mels // 16, T // 4 * 4)
+ # output: (B, dim_in, n_mels, T // 4 * 4)
for block in self.decode:
x = block(x, s)
if (masks is not None) and (x.shape[2] in [32, 64, 128]):
mask = masks[0] if x.shape[2] in [32] else masks[1]
mask = F.interpolate(mask, size=x.shape[2], mode='bilinear')
x = x + self.hpf(mask * cache[x.shape[2]])
+ out = self.to_out(x)
+ return out
- return self.to_out(x)
+ def reset_parameters(self):
+ self.apply(_reset_parameters)
class MappingNetwork(nn.Layer):
@@ -365,18 +434,33 @@ class MappingNetwork(nn.Layer):
nn.ReLU(), nn.Linear(hidden_dim, style_dim))
])
+ self.reset_parameters()
+
def forward(self, z: paddle.Tensor, y: paddle.Tensor):
+ """Calculate forward propagation.
+ Args:
+ z(Tensor(float32)):
+ Shape (B, latent_dim).
+ y(Tensor(float32)):
+ speaker label. Shape (B, ).
+ Returns:
+ Tensor:
+ Shape (style_dim, )
+ """
h = self.shared(z)
out = []
for layer in self.unshared:
out += [layer(h)]
- # (batch, num_domains, style_dim)
+ # (B, num_domains, style_dim)
out = paddle.stack(out, axis=1)
idx = paddle.arange(y.shape[0])
- # (batch, style_dim)
+ # (style_dim, )
s = out[idx, y]
return s
+ def reset_parameters(self):
+ self.apply(_reset_parameters)
+
class StyleEncoder(nn.Layer):
def __init__(self,
@@ -418,19 +502,34 @@ class StyleEncoder(nn.Layer):
for _ in range(num_domains):
self.unshared.append(nn.Linear(dim_out, style_dim))
+ self.reset_parameters()
+
def forward(self, x: paddle.Tensor, y: paddle.Tensor):
+ """Calculate forward propagation.
+ Args:
+ x(Tensor(float32)):
+ Shape (B, 1, n_mels, T).
+ y(Tensor(float32)):
+ speaker label. Shape (B, ).
+ Returns:
+ Tensor:
+ Shape (style_dim, )
+ """
h = self.shared(x)
h = h.reshape((h.shape[0], -1))
out = []
for layer in self.unshared:
out += [layer(h)]
- # (batch, num_domains, style_dim)
+ # (B, num_domains, style_dim)
out = paddle.stack(out, axis=1)
idx = paddle.arange(y.shape[0])
- # (batch, style_dim)
+ # (style_dim,)
s = out[idx, y]
return s
+ def reset_parameters(self):
+ self.apply(_reset_parameters)
+
class Discriminator(nn.Layer):
def __init__(self,
@@ -453,26 +552,28 @@ class Discriminator(nn.Layer):
repeat_num=repeat_num)
self.num_domains = num_domains
+ self.reset_parameters()
+
def forward(self, x: paddle.Tensor, y: paddle.Tensor):
- return self.dis(x, y)
+ """Calculate forward propagation.
+ Args:
+ x(Tensor(float32)):
+ Shape (B, 1, 80, T).
+ y(Tensor(float32)):
+ Shape (B, ).
+ Returns:
+ Tensor:
+ Shape (B, )
+ """
+ out = self.dis(x, y)
+ return out
def classifier(self, x: paddle.Tensor):
- return self.cls.get_feature(x)
-
-
-class LinearNorm(nn.Layer):
- def __init__(self,
- in_dim: int,
- out_dim: int,
- bias: bool=True,
- w_init_gain: str='linear'):
- super().__init__()
- self.linear_layer = nn.Linear(in_dim, out_dim, bias_attr=bias)
- xavier_uniform_(
- self.linear_layer.weight, gain=_calculate_gain(w_init_gain))
+ out = self.cls.get_feature(x)
+ return out
- def forward(self, x):
- return self.linear_layer(x)
+ def reset_parameters(self):
+ self.apply(_reset_parameters)
class Discriminator2D(nn.Layer):
@@ -520,97 +621,13 @@ class Discriminator2D(nn.Layer):
def get_feature(self, x: paddle.Tensor):
out = self.main(x)
- # (batch, num_domains)
+ # (B, num_domains)
out = out.reshape((out.shape[0], -1))
return out
def forward(self, x: paddle.Tensor, y: paddle.Tensor):
out = self.get_feature(x)
idx = paddle.arange(y.shape[0])
- # (batch)
+        # (B,)
out = out[idx, y]
return out
-
-
-'''
-def build_model(args, F0_model: nn.Layer, ASR_model: nn.Layer):
- generator = Generator(
- dim_in=args.dim_in,
- style_dim=args.style_dim,
- max_conv_dim=args.max_conv_dim,
- w_hpf=args.w_hpf,
- F0_channel=args.F0_channel)
- mapping_network = MappingNetwork(
- latent_dim=args.latent_dim,
- style_dim=args.style_dim,
- num_domains=args.num_domains,
- hidden_dim=args.max_conv_dim)
- style_encoder = StyleEncoder(
- dim_in=args.dim_in,
- style_dim=args.style_dim,
- num_domains=args.num_domains,
- max_conv_dim=args.max_conv_dim)
- discriminator = Discriminator(
- dim_in=args.dim_in,
- num_domains=args.num_domains,
- max_conv_dim=args.max_conv_dim,
- n_repeat=args.n_repeat)
- generator_ema = copy.deepcopy(generator)
- mapping_network_ema = copy.deepcopy(mapping_network)
- style_encoder_ema = copy.deepcopy(style_encoder)
-
- nets = Munch(
- generator=generator,
- mapping_network=mapping_network,
- style_encoder=style_encoder,
- discriminator=discriminator,
- f0_model=F0_model,
- asr_model=ASR_model)
-
- nets_ema = Munch(
- generator=generator_ema,
- mapping_network=mapping_network_ema,
- style_encoder=style_encoder_ema)
-
- return nets, nets_ema
-
-
-class StarGANv2VC(nn.Layer):
- def __init__(
- self,
- # spk_num
- num_domains: int=20,
- dim_in: int=64,
- style_dim: int=64,
- latent_dim: int=16,
- max_conv_dim: int=512,
- n_repeat: int=4,
- w_hpf: int=0,
- F0_channel: int=256):
- super().__init__()
-
- self.generator = Generator(
- dim_in=dim_in,
- style_dim=style_dim,
- max_conv_dim=max_conv_dim,
- w_hpf=w_hpf,
- F0_channel=F0_channel)
- # MappingNetwork and StyleEncoder are used to generate reference_embeddings
- self.mapping_network = MappingNetwork(
- latent_dim=latent_dim,
- style_dim=style_dim,
- num_domains=num_domains,
- hidden_dim=max_conv_dim)
-
- self.style_encoder = StyleEncoder(
- dim_in=dim_in,
- style_dim=style_dim,
- num_domains=num_domains,
- max_conv_dim=max_conv_dim)
-
- self.discriminator = Discriminator(
- dim_in=dim_in,
- num_domains=num_domains,
- max_conv_dim=max_conv_dim,
- repeat_num=n_repeat)
-'''
diff --git a/paddlespeech/t2s/models/starganv2_vc/starganv2_vc_updater.py b/paddlespeech/t2s/models/starganv2_vc/starganv2_vc_updater.py
index 595add0a..1b811a3f 100644
--- a/paddlespeech/t2s/models/starganv2_vc/starganv2_vc_updater.py
+++ b/paddlespeech/t2s/models/starganv2_vc/starganv2_vc_updater.py
@@ -11,3 +11,298 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+import logging
+from typing import Any
+from typing import Dict
+
+from paddle import distributed as dist
+from paddle.io import DataLoader
+from paddle.nn import Layer
+from paddle.optimizer import Optimizer
+from paddle.optimizer.lr import LRScheduler
+
+from paddlespeech.t2s.models.starganv2_vc.losses import compute_d_loss
+from paddlespeech.t2s.models.starganv2_vc.losses import compute_g_loss
+from paddlespeech.t2s.training.extensions.evaluator import StandardEvaluator
+from paddlespeech.t2s.training.reporter import report
+from paddlespeech.t2s.training.updaters.standard_updater import StandardUpdater
+from paddlespeech.t2s.training.updaters.standard_updater import UpdaterState
+
+logging.basicConfig(
+ format='%(asctime)s [%(levelname)s] [%(filename)s:%(lineno)d] %(message)s',
+ datefmt='[%Y-%m-%d %H:%M:%S]')
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+class StarGANv2VCUpdater(StandardUpdater):
+ def __init__(self,
+ models: Dict[str, Layer],
+ optimizers: Dict[str, Optimizer],
+ schedulers: Dict[str, LRScheduler],
+ dataloader: DataLoader,
+ g_loss_params: Dict[str, Any]={
+ 'lambda_sty': 1.,
+ 'lambda_cyc': 5.,
+ 'lambda_ds': 1.,
+ 'lambda_norm': 1.,
+ 'lambda_asr': 10.,
+ 'lambda_f0': 5.,
+ 'lambda_f0_sty': 0.1,
+ 'lambda_adv': 2.,
+ 'lambda_adv_cls': 0.5,
+ 'norm_bias': 0.5,
+ },
+ d_loss_params: Dict[str, Any]={
+ 'lambda_reg': 1.,
+ 'lambda_adv_cls': 0.1,
+ 'lambda_con_reg': 10.,
+ },
+ adv_cls_epoch: int=50,
+ con_reg_epoch: int=30,
+ use_r1_reg: bool=False,
+ output_dir=None):
+ self.models = models
+
+ self.optimizers = optimizers
+ self.optimizer_g = optimizers['generator']
+ self.optimizer_s = optimizers['style_encoder']
+ self.optimizer_m = optimizers['mapping_network']
+ self.optimizer_d = optimizers['discriminator']
+
+ self.schedulers = schedulers
+ self.scheduler_g = schedulers['generator']
+ self.scheduler_s = schedulers['style_encoder']
+ self.scheduler_m = schedulers['mapping_network']
+ self.scheduler_d = schedulers['discriminator']
+
+ self.dataloader = dataloader
+
+ self.g_loss_params = g_loss_params
+ self.d_loss_params = d_loss_params
+
+ self.use_r1_reg = use_r1_reg
+ self.con_reg_epoch = con_reg_epoch
+ self.adv_cls_epoch = adv_cls_epoch
+
+ self.state = UpdaterState(iteration=0, epoch=0)
+ self.train_iterator = iter(self.dataloader)
+
+ log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
+ self.filehandler = logging.FileHandler(str(log_file))
+ logger.addHandler(self.filehandler)
+ self.logger = logger
+ self.msg = ""
+
+ def zero_grad(self):
+ self.optimizer_d.clear_grad()
+ self.optimizer_g.clear_grad()
+ self.optimizer_m.clear_grad()
+ self.optimizer_s.clear_grad()
+
+ def scheduler(self):
+ self.scheduler_d.step()
+ self.scheduler_g.step()
+ self.scheduler_m.step()
+ self.scheduler_s.step()
+
+ def update_core(self, batch):
+ self.msg = "Rank: {}, ".format(dist.get_rank())
+ losses_dict = {}
+ # parse batch
+ x_real = batch['x_real']
+ y_org = batch['y_org']
+ x_ref = batch['x_ref']
+ x_ref2 = batch['x_ref2']
+ y_trg = batch['y_trg']
+ z_trg = batch['z_trg']
+ z_trg2 = batch['z_trg2']
+
+ use_con_reg = (self.state.epoch >= self.con_reg_epoch)
+ use_adv_cls = (self.state.epoch >= self.adv_cls_epoch)
+
+ # Discriminator loss
+ # train the discriminator (by random reference)
+ self.zero_grad()
+ random_d_loss = compute_d_loss(
+ nets=self.models,
+ x_real=x_real,
+ y_org=y_org,
+ y_trg=y_trg,
+ z_trg=z_trg,
+ use_adv_cls=use_adv_cls,
+ use_con_reg=use_con_reg,
+ **self.d_loss_params)
+ random_d_loss.backward()
+ self.optimizer_d.step()
+ # train the discriminator (by target reference)
+ self.zero_grad()
+ target_d_loss = compute_d_loss(
+ nets=self.models,
+ x_real=x_real,
+ y_org=y_org,
+ y_trg=y_trg,
+ x_ref=x_ref,
+ use_adv_cls=use_adv_cls,
+ use_con_reg=use_con_reg,
+ **self.d_loss_params)
+ target_d_loss.backward()
+ self.optimizer_d.step()
+ report("train/random_d_loss", float(random_d_loss))
+ report("train/target_d_loss", float(target_d_loss))
+ losses_dict["random_d_loss"] = float(random_d_loss)
+ losses_dict["target_d_loss"] = float(target_d_loss)
+
+ # Generator
+ # train the generator (by random reference)
+ self.zero_grad()
+ random_g_loss = compute_g_loss(
+ nets=self.models,
+ x_real=x_real,
+ y_org=y_org,
+ y_trg=y_trg,
+ z_trgs=[z_trg, z_trg2],
+ use_adv_cls=use_adv_cls,
+ **self.g_loss_params)
+ random_g_loss.backward()
+ self.optimizer_g.step()
+ self.optimizer_m.step()
+ self.optimizer_s.step()
+
+ # train the generator (by target reference)
+ self.zero_grad()
+ target_g_loss = compute_g_loss(
+ nets=self.models,
+ x_real=x_real,
+ y_org=y_org,
+ y_trg=y_trg,
+ x_refs=[x_ref, x_ref2],
+ use_adv_cls=use_adv_cls,
+ **self.g_loss_params)
+ target_g_loss.backward()
+        # Should optimizer_g, optimizer_m and optimizer_s all be stepped here?
+        # The reference implementation only steps the generator after the
+        # target-reference pass; it is unclear whether leaving out the other
+        # two was an oversight.
+        self.optimizer_g.step()
+        # self.optimizer_m.step()
+        # self.optimizer_s.step()
+ report("train/random_g_loss", float(random_g_loss))
+ report("train/target_g_loss", float(target_g_loss))
+ losses_dict["random_g_loss"] = float(random_g_loss)
+ losses_dict["target_g_loss"] = float(target_g_loss)
+
+ self.scheduler()
+
+ self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
+ for k, v in losses_dict.items())
+
+
+class StarGANv2VCEvaluator(StandardEvaluator):
+ def __init__(self,
+ models: Dict[str, Layer],
+ dataloader: DataLoader,
+ g_loss_params: Dict[str, Any]={
+ 'lambda_sty': 1.,
+ 'lambda_cyc': 5.,
+ 'lambda_ds': 1.,
+ 'lambda_norm': 1.,
+ 'lambda_asr': 10.,
+ 'lambda_f0': 5.,
+ 'lambda_f0_sty': 0.1,
+ 'lambda_adv': 2.,
+ 'lambda_adv_cls': 0.5,
+ 'norm_bias': 0.5,
+ },
+ d_loss_params: Dict[str, Any]={
+ 'lambda_reg': 1.,
+ 'lambda_adv_cls': 0.1,
+ 'lambda_con_reg': 10.,
+ },
+ adv_cls_epoch: int=50,
+ con_reg_epoch: int=30,
+ use_r1_reg: bool=False,
+ output_dir=None):
+ self.models = models
+
+ self.dataloader = dataloader
+
+ self.g_loss_params = g_loss_params
+ self.d_loss_params = d_loss_params
+
+ self.use_r1_reg = use_r1_reg
+ self.con_reg_epoch = con_reg_epoch
+ self.adv_cls_epoch = adv_cls_epoch
+
+ log_file = output_dir / 'worker_{}.log'.format(dist.get_rank())
+ self.filehandler = logging.FileHandler(str(log_file))
+ logger.addHandler(self.filehandler)
+ self.logger = logger
+ self.msg = ""
+
+ def evaluate_core(self, batch):
+ # logging.debug("Evaluate: ")
+ self.msg = "Evaluate: "
+ losses_dict = {}
+
+ x_real = batch['x_real']
+ y_org = batch['y_org']
+ x_ref = batch['x_ref']
+ x_ref2 = batch['x_ref2']
+ y_trg = batch['y_trg']
+ z_trg = batch['z_trg']
+ z_trg2 = batch['z_trg2']
+
+        # NOTE: mirror the training-time switch; the evaluator has no epoch
+        # state, so the adversarial classifier branch is disabled here to
+        # avoid an undefined `use_adv_cls`.
+        use_adv_cls = False
+
+        # eval the discriminator
+
+ random_d_loss = compute_d_loss(
+ nets=self.models,
+ x_real=x_real,
+ y_org=y_org,
+ y_trg=y_trg,
+ z_trg=z_trg,
+ use_r1_reg=self.use_r1_reg,
+ use_adv_cls=use_adv_cls,
+ **self.d_loss_params)
+
+ target_d_loss = compute_d_loss(
+ nets=self.models,
+ x_real=x_real,
+ y_org=y_org,
+ y_trg=y_trg,
+ x_ref=x_ref,
+ use_r1_reg=self.use_r1_reg,
+ use_adv_cls=use_adv_cls,
+ **self.d_loss_params)
+
+ report("eval/random_d_loss", float(random_d_loss))
+ report("eval/target_d_loss", float(target_d_loss))
+ losses_dict["random_d_loss"] = float(random_d_loss)
+ losses_dict["target_d_loss"] = float(target_d_loss)
+
+ # eval the generator
+
+ random_g_loss = compute_g_loss(
+ nets=self.models,
+ x_real=x_real,
+ y_org=y_org,
+ y_trg=y_trg,
+ z_trgs=[z_trg, z_trg2],
+ use_adv_cls=use_adv_cls,
+ **self.g_loss_params)
+
+ target_g_loss = compute_g_loss(
+ nets=self.models,
+ x_real=x_real,
+ y_org=y_org,
+ y_trg=y_trg,
+ x_refs=[x_ref, x_ref2],
+ use_adv_cls=use_adv_cls,
+ **self.g_loss_params)
+
+ report("eval/random_g_loss", float(random_g_loss))
+ report("eval/target_g_loss", float(target_g_loss))
+ losses_dict["random_g_loss"] = float(random_g_loss)
+ losses_dict["target_g_loss"] = float(target_g_loss)
+
+ self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
+ for k, v in losses_dict.items())
+ self.logger.info(self.msg)
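For reference, a minimal sketch of how the updater above is wired together. Only the four dictionary keys read in `__init__` are shown; the `Linear` layers, optimizer settings and dummy dataloader are placeholders (the real recipe passes the StarGANv2-VC sub-models, and the loss functions additionally expect the frozen ASR/F0 nets in the same `models` dict):

from pathlib import Path

import paddle
from paddle.io import DataLoader, TensorDataset
from paddle.optimizer import AdamW
from paddle.optimizer.lr import StepDecay

from paddlespeech.t2s.models.starganv2_vc.starganv2_vc_updater import (
    StarGANv2VCUpdater, )

# stand-in sub-models under the keys read in __init__
models = {
    'generator': paddle.nn.Linear(8, 8),
    'style_encoder': paddle.nn.Linear(8, 8),
    'mapping_network': paddle.nn.Linear(8, 8),
    'discriminator': paddle.nn.Linear(8, 8),
}
schedulers = {
    k: StepDecay(learning_rate=2e-4, step_size=100)
    for k in models
}
optimizers = {
    k: AdamW(learning_rate=schedulers[k], parameters=m.parameters())
    for k, m in models.items()
}
dataloader = DataLoader(TensorDataset([paddle.zeros([4, 8])]), batch_size=2)

output_dir = Path('exp/tmp')
output_dir.mkdir(parents=True, exist_ok=True)

updater = StarGANv2VCUpdater(
    models=models,
    optimizers=optimizers,
    schedulers=schedulers,
    dataloader=dataloader,
    output_dir=output_dir)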
diff --git a/paddlespeech/t2s/models/starganv2_vc/transforms.py b/paddlespeech/t2s/models/starganv2_vc/transforms.py
new file mode 100644
index 00000000..d7586147
--- /dev/null
+++ b/paddlespeech/t2s/models/starganv2_vc/transforms.py
@@ -0,0 +1,143 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import random
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+
+
+## 1. RandomTimeStrech
+class TimeStrech(nn.Layer):
+ def __init__(self, scale):
+ super().__init__()
+ self.scale = scale
+
+ def forward(self, x: paddle.Tensor):
+ mel_size = x.shape[-1]
+
+ x = F.interpolate(
+ x,
+ scale_factor=(1, self.scale),
+ align_corners=False,
+ mode='bilinear').squeeze()
+
+ if x.shape[-1] < mel_size:
+ noise_length = (mel_size - x.shape[-1])
+ random_pos = random.randint(0, x.shape[-1]) - noise_length
+ if random_pos < 0:
+ random_pos = 0
+ noise = x[..., random_pos:random_pos + noise_length]
+ x = paddle.concat([x, noise], axis=-1)
+ else:
+ x = x[..., :mel_size]
+
+ return x.unsqueeze(1)
+
+
+## 2. PitchShift
+class PitchShift(nn.Layer):
+ def __init__(self, shift):
+ super().__init__()
+ self.shift = shift
+
+ def forward(self, x: paddle.Tensor):
+ if len(x.shape) == 2:
+ x = x.unsqueeze(0)
+ x = x.squeeze()
+ mel_size = x.shape[1]
+ shift_scale = (mel_size + self.shift) / mel_size
+ x = F.interpolate(
+ x.unsqueeze(1),
+ scale_factor=(shift_scale, 1.),
+ align_corners=False,
+ mode='bilinear').squeeze(1)
+
+ x = x[:, :mel_size]
+ if x.shape[1] < mel_size:
+ pad_size = mel_size - x.shape[1]
+            x = paddle.concat(
+                [x, paddle.zeros([x.shape[0], pad_size, x.shape[2]])], axis=1)
+ x = x.squeeze()
+ return x.unsqueeze(1)
+
+
+## 3. ShiftBias
+class ShiftBias(nn.Layer):
+ def __init__(self, bias):
+ super().__init__()
+ self.bias = bias
+
+ def forward(self, x: paddle.Tensor):
+ return x + self.bias
+
+
+## 4. Scaling
+class SpectScaling(nn.Layer):
+ def __init__(self, scale):
+ super().__init__()
+ self.scale = scale
+
+ def forward(self, x: paddle.Tensor):
+ return x * self.scale
+
+
+## 5. Time Flip
+class TimeFlip(nn.Layer):
+ def __init__(self, length):
+ super().__init__()
+ self.length = round(length)
+
+ def forward(self, x: paddle.Tensor):
+ if self.length > 1:
+ start = np.random.randint(0, x.shape[-1] - self.length)
+ x_ret = x.clone()
+ x_ret[..., start:start + self.length] = paddle.flip(
+ x[..., start:start + self.length], axis=[-1])
+ x = x_ret
+ return x
+
+
+class PhaseShuffle2D(nn.Layer):
+ def __init__(self, n: int=2):
+ super().__init__()
+ self.n = n
+ self.random = random.Random(1)
+
+ def forward(self, x: paddle.Tensor, move=None):
+ # x.size = (B, C, M, L)
+ if move is None:
+ move = self.random.randint(-self.n, self.n)
+
+ if move == 0:
+ return x
+ else:
+ left = x[:, :, :, :move]
+ right = x[:, :, :, move:]
+ shuffled = paddle.concat([right, left], axis=3)
+
+ return shuffled
+
+
+def build_transforms():
+ transforms = [
+ lambda M: TimeStrech(1 + (np.random.random() - 0.5) * M * 0.2),
+ lambda M: SpectScaling(1 + (np.random.random() - 1) * M * 0.1),
+ lambda M: PhaseShuffle2D(192),
+ ]
+ N, M = len(transforms), np.random.random()
+ composed = nn.Sequential(
+ * [trans(M) for trans in np.random.choice(transforms, N)])
+ return composed
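A quick usage sketch for the augmentations above; the (B, 1, n_mels, T) layout is an assumption based on the `PhaseShuffle2D` comment, and the sizes are made up:

import paddle

from paddlespeech.t2s.models.starganv2_vc.transforms import build_transforms

# fake batch of mel-spectrograms: (B, 1, n_mels, T)
mels = paddle.randn([4, 1, 80, 192])

# randomly composed TimeStrech / SpectScaling / PhaseShuffle2D pipeline
aug = build_transforms()
mels_aug = aug(mels)
print(mels_aug.shape)  # the transforms above keep the (B, 1, n_mels, T) shape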
diff --git a/paddlespeech/t2s/models/vits/duration_predictor.py b/paddlespeech/t2s/models/vits/duration_predictor.py
index b0bb68d0..12177fbc 100644
--- a/paddlespeech/t2s/models/vits/duration_predictor.py
+++ b/paddlespeech/t2s/models/vits/duration_predictor.py
@@ -155,12 +155,10 @@ class StochasticDurationPredictor(nn.Layer):
z_u, z1 = paddle.split(z_q, [1, 1], 1)
u = F.sigmoid(z_u) * x_mask
z0 = (w - u) * x_mask
- logdet_tot_q += paddle.sum(
- (F.log_sigmoid(z_u) + F.log_sigmoid(-z_u)) * x_mask, [1, 2])
- logq = (paddle.sum(-0.5 *
- (math.log(2 * math.pi) +
- (e_q**2)) * x_mask, [1, 2]) - logdet_tot_q)
-
+ tmp1 = (F.log_sigmoid(z_u) + F.log_sigmoid(-z_u)) * x_mask
+ logdet_tot_q += paddle.sum(tmp1, [1, 2])
+ tmp2 = -0.5 * (math.log(2 * math.pi) + (e_q**2)) * x_mask
+ logq = (paddle.sum(tmp2, [1, 2]) - logdet_tot_q)
logdet_tot = 0
z0, logdet = self.log_flow(z0, x_mask)
logdet_tot += logdet
@@ -168,8 +166,8 @@ class StochasticDurationPredictor(nn.Layer):
for flow in self.flows:
z, logdet = flow(z, x_mask, g=x, inverse=inverse)
logdet_tot = logdet_tot + logdet
- nll = (paddle.sum(0.5 * (math.log(2 * math.pi) +
- (z**2)) * x_mask, [1, 2]) - logdet_tot)
+ tmp3 = 0.5 * (math.log(2 * math.pi) + (z**2)) * x_mask
+ nll = (paddle.sum(tmp3, [1, 2]) - logdet_tot)
# (B,)
return nll + logq
else:
diff --git a/paddlespeech/t2s/models/vits/flow.py b/paddlespeech/t2s/models/vits/flow.py
index 7593eb72..94df968a 100644
--- a/paddlespeech/t2s/models/vits/flow.py
+++ b/paddlespeech/t2s/models/vits/flow.py
@@ -334,11 +334,12 @@ class ConvFlow(nn.Layer):
unnorm_widths = h[..., :self.bins] / denom
unnorm_heights = h[..., self.bins:2 * self.bins] / denom
unnorm_derivatives = h[..., 2 * self.bins:]
+
xb, logdet_abs = piecewise_rational_quadratic_transform(
- xb,
- unnorm_widths,
- unnorm_heights,
- unnorm_derivatives,
+ inputs=xb,
+ unnormalized_widths=unnorm_widths,
+ unnormalized_heights=unnorm_heights,
+ unnormalized_derivatives=unnorm_derivatives,
inverse=inverse,
tails="linear",
tail_bound=self.tail_bound, )
diff --git a/paddlespeech/t2s/models/vits/generator.py b/paddlespeech/t2s/models/vits/generator.py
index fbd2d665..427ae09e 100644
--- a/paddlespeech/t2s/models/vits/generator.py
+++ b/paddlespeech/t2s/models/vits/generator.py
@@ -371,8 +371,9 @@ class VITSGenerator(nn.Layer):
# (B, H, T_text)
s_p_sq_r = paddle.exp(-2 * logs_p)
# (B, 1, T_text)
+ tmp1 = -0.5 * math.log(2 * math.pi) - logs_p
neg_x_ent_1 = paddle.sum(
- -0.5 * math.log(2 * math.pi) - logs_p,
+ tmp1,
[1],
keepdim=True, )
# (B, T_feats, H) x (B, H, T_text) = (B, T_feats, T_text)
@@ -384,8 +385,9 @@ class VITSGenerator(nn.Layer):
z_p.transpose([0, 2, 1]),
(m_p * s_p_sq_r), )
# (B, 1, T_text)
+ tmp2 = -0.5 * (m_p**2) * s_p_sq_r
neg_x_ent_4 = paddle.sum(
- -0.5 * (m_p**2) * s_p_sq_r,
+ tmp2,
[1],
keepdim=True, )
# (B, T_feats, T_text)
@@ -403,7 +405,6 @@ class VITSGenerator(nn.Layer):
w = attn.sum(2)
dur_nll = self.duration_predictor(x, x_mask, w=w, g=g)
dur_nll = dur_nll / paddle.sum(x_mask)
-
# expand the length to match with the feature sequence
# (B, T_feats, T_text) x (B, T_text, H) -> (B, H, T_feats)
m_p = paddle.matmul(attn.squeeze(1),
@@ -511,8 +512,9 @@ class VITSGenerator(nn.Layer):
# (B, H, T_text)
s_p_sq_r = paddle.exp(-2 * logs_p)
# (B, 1, T_text)
+ tmp3 = -0.5 * math.log(2 * math.pi) - logs_p
neg_x_ent_1 = paddle.sum(
- -0.5 * math.log(2 * math.pi) - logs_p,
+ tmp3,
[1],
keepdim=True, )
# (B, T_feats, H) x (B, H, T_text) = (B, T_feats, T_text)
@@ -524,8 +526,9 @@ class VITSGenerator(nn.Layer):
z_p.transpose([0, 2, 1]),
(m_p * s_p_sq_r), )
# (B, 1, T_text)
+ tmp4 = -0.5 * (m_p**2) * s_p_sq_r
neg_x_ent_4 = paddle.sum(
- -0.5 * (m_p**2) * s_p_sq_r,
+ tmp4,
[1],
keepdim=True, )
# (B, T_feats, T_text)
@@ -556,8 +559,9 @@ class VITSGenerator(nn.Layer):
y_lengths = paddle.cast(
paddle.clip(paddle.sum(dur, [1, 2]), min=1), dtype='int64')
y_mask = make_non_pad_mask(y_lengths).unsqueeze(1)
- attn_mask = paddle.unsqueeze(x_mask, 2) * paddle.unsqueeze(y_mask,
- -1)
+ tmp_a = paddle.cast(paddle.unsqueeze(x_mask, 2), dtype='int64')
+ tmp_b = paddle.cast(paddle.unsqueeze(y_mask, -1), dtype='int64')
+ attn_mask = tmp_a * tmp_b
attn = self._generate_path(dur, attn_mask)
# expand the length to match with the feature sequence
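The cast-before-multiply above sidesteps elementwise multiplication of the (presumably bool) masks; in isolation the pattern looks like this (sizes made up):

import paddle

x_mask = paddle.ones([2, 1, 5], dtype='bool')   # (B, 1, T_text)
y_mask = paddle.ones([2, 1, 7], dtype='bool')   # (B, 1, T_feats)

# cast to int64 first instead of multiplying the bool masks directly
tmp_a = paddle.cast(paddle.unsqueeze(x_mask, 2), dtype='int64')   # (B, 1, 1, T_text)
tmp_b = paddle.cast(paddle.unsqueeze(y_mask, -1), dtype='int64')  # (B, 1, T_feats, 1)
attn_mask = tmp_a * tmp_b                                         # (B, 1, T_feats, T_text)
print(attn_mask.shape)  # [2, 1, 7, 5]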
diff --git a/paddlespeech/t2s/models/vits/transform.py b/paddlespeech/t2s/models/vits/transform.py
index 61bd5ee2..917f2843 100644
--- a/paddlespeech/t2s/models/vits/transform.py
+++ b/paddlespeech/t2s/models/vits/transform.py
@@ -61,8 +61,12 @@ def piecewise_rational_quadratic_transform(
def mask_preprocess(x, mask):
+ # bins.dtype = int32
B, C, T, bins = paddle.shape(x)
- new_x = paddle.zeros([mask.sum(), bins])
+ mask_int = paddle.cast(mask, dtype='int64')
+    # when the input of paddle.sum is int32 or bool, the output dtype is int64
+    # the shape passed to paddle.zeros (fill_constant) is forcibly cast to int32
+ new_x = paddle.zeros([paddle.sum(mask_int), bins])
for i in range(bins):
new_x[:, i] = x[:, :, :, i][mask]
return new_x
@@ -240,4 +244,7 @@ def rational_quadratic_spline(
def _searchsorted(bin_locations, inputs, eps=1e-6):
bin_locations[..., -1] += eps
- return paddle.sum(inputs[..., None] >= bin_locations, axis=-1) - 1
+ mask = inputs[..., None] >= bin_locations
+ mask_int = paddle.cast(mask, dtype='int64')
+ out = paddle.sum(mask_int, axis=-1) - 1
+ return out
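The rewritten `_searchsorted` counts how many bin edges lie at or below each input, which is equivalent to a searchsorted-style bin index; a small worked example with made-up bins:

import paddle

bin_locations = paddle.to_tensor([0.0, 0.25, 0.5, 0.75, 1.0])
inputs = paddle.to_tensor([0.1, 0.6, 0.95])

mask = inputs[..., None] >= bin_locations        # (3, 5) bool comparison
mask_int = paddle.cast(mask, dtype='int64')
bin_idx = paddle.sum(mask_int, axis=-1) - 1
print(bin_idx.numpy())  # [0, 2, 3] -> bins [0, 0.25), [0.5, 0.75), [0.75, 1.0)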
diff --git a/paddlespeech/t2s/models/vits/vits_updater.py b/paddlespeech/t2s/models/vits/vits_updater.py
index 9f8be680..e61e617c 100644
--- a/paddlespeech/t2s/models/vits/vits_updater.py
+++ b/paddlespeech/t2s/models/vits/vits_updater.py
@@ -166,7 +166,9 @@ class VITSUpdater(StandardUpdater):
gen_loss.backward()
self.optimizer_g.step()
- self.scheduler_g.step()
+        # the learning rate is updated once per epoch.
+ if self.state.iteration % self.updates_per_epoch == 0:
+ self.scheduler_g.step()
# reset cache
if self.model.reuse_cache_gen or not self.model.training:
@@ -202,7 +204,9 @@ class VITSUpdater(StandardUpdater):
dis_loss.backward()
self.optimizer_d.step()
- self.scheduler_d.step()
+        # the learning rate is updated once per epoch.
+ if self.state.iteration % self.updates_per_epoch == 0:
+ self.scheduler_d.step()
# reset cache
if self.model.reuse_cache_dis or not self.model.training:
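A sketch of the intended schedule: assuming `updates_per_epoch` is set to the number of batches per epoch elsewhere in the updater, the learning rate now decays once per epoch instead of once per iteration, e.g.:

import paddle

sched = paddle.optimizer.lr.ExponentialDecay(learning_rate=2e-4, gamma=0.999)
updates_per_epoch = 1000  # assumed to equal len(train_dataloader)

for iteration in range(1, 3 * updates_per_epoch + 1):
    # optimizer_g.step() / optimizer_d.step() would run every iteration here
    if iteration % updates_per_epoch == 0:
        sched.step()  # one LR decay per epoch

print(sched.get_lr())  # 2e-4 * 0.999 ** 3 after three epochs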
diff --git a/paddlespeech/t2s/models/waveflow.py b/paddlespeech/t2s/models/waveflow.py
index 8e2ce822..b4818cab 100644
--- a/paddlespeech/t2s/models/waveflow.py
+++ b/paddlespeech/t2s/models/waveflow.py
@@ -236,7 +236,7 @@ class ResidualBlock(nn.Layer):
Returns:
res (Tensor):
- A row of the the residual output. shape=(batch_size, channel, 1, width)
+ A row of the residual output. shape=(batch_size, channel, 1, width)
skip (Tensor):
A row of the skip output. shape=(batch_size, channel, 1, width)
@@ -343,7 +343,7 @@ class ResidualNet(nn.LayerList):
Returns:
res (Tensor):
- A row of the the residual output. shape=(batch_size, channel, 1, width)
+ A row of the residual output. shape=(batch_size, channel, 1, width)
skip (Tensor):
A row of the skip output. shape=(batch_size, channel, 1, width)
@@ -465,7 +465,7 @@ class Flow(nn.Layer):
self.resnet.start_sequence()
def inverse(self, z, condition):
- """Sampling from the the distrition p(X). It is done by sample form
+ """Sampling from the distrition p(X). It is done by sample form
p(Z) and transform the sample. It is a auto regressive transformation.
Args:
@@ -600,7 +600,7 @@ class WaveFlow(nn.LayerList):
return z, log_det_jacobian
def inverse(self, z, condition):
- """Sampling from the the distrition p(X).
+ """Sampling from the distrition p(X).
It is done by sample a ``z`` form p(Z) and transform it into ``x``.
Each Flow transform .. math:: `z_{i-1}` to .. math:: `z_{i}` in an
diff --git a/paddlespeech/t2s/modules/activation.py b/paddlespeech/t2s/modules/activation.py
index 8d8cd62e..f1c099b7 100644
--- a/paddlespeech/t2s/modules/activation.py
+++ b/paddlespeech/t2s/modules/activation.py
@@ -37,7 +37,8 @@ def get_activation(act, **kwargs):
"selu": paddle.nn.SELU,
"leakyrelu": paddle.nn.LeakyReLU,
"swish": paddle.nn.Swish,
- "glu": GLU
+ "glu": GLU,
+ "gelu": paddle.nn.GELU,
}
return activation_funcs[act](**kwargs)
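With the new entry, GELU is available through the same factory; keyword arguments are forwarded to the paddle layer:

import paddle

from paddlespeech.t2s.modules.activation import get_activation

act = get_activation("gelu")  # paddle.nn.GELU()
# act = get_activation("gelu", approximate=True)  # tanh approximation, if wanted
x = paddle.randn([2, 8])
print(act(x).shape)  # [2, 8]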
diff --git a/paddlespeech/t2s/modules/conformer/encoder_layer.py b/paddlespeech/t2s/modules/conformer/encoder_layer.py
index 26a35456..6c416088 100644
--- a/paddlespeech/t2s/modules/conformer/encoder_layer.py
+++ b/paddlespeech/t2s/modules/conformer/encoder_layer.py
@@ -113,7 +113,6 @@ class EncoderLayer(nn.Layer):
x, pos_emb = x_input[0], x_input[1]
else:
x, pos_emb = x_input, None
-
skip_layer = False
# with stochastic depth, residual connection `x + f(x)` becomes
# `x <- x + 1 / (1 - p) * f(x)` at training time.
@@ -121,14 +120,12 @@ class EncoderLayer(nn.Layer):
if self.training and self.stochastic_depth_rate > 0:
skip_layer = paddle.rand(1).item() < self.stochastic_depth_rate
stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)
-
if skip_layer:
if cache is not None:
x = paddle.concat([cache, x], axis=1)
if pos_emb is not None:
return (x, pos_emb), mask
return x, mask
-
# whether to use macaron style
if self.feed_forward_macaron is not None:
residual = x
@@ -138,7 +135,6 @@ class EncoderLayer(nn.Layer):
self.feed_forward_macaron(x))
if not self.normalize_before:
x = self.norm_ff_macaron(x)
-
# multi-headed self-attention module
residual = x
if self.normalize_before:
diff --git a/paddlespeech/t2s/modules/diffnet.py b/paddlespeech/t2s/modules/diffnet.py
new file mode 100644
index 00000000..2f433ad6
--- /dev/null
+++ b/paddlespeech/t2s/modules/diffnet.py
@@ -0,0 +1,244 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+
+import paddle
+import paddle.nn.functional as F
+from paddle import nn
+
+from paddlespeech.utils.initialize import _calculate_fan_in_and_fan_out
+from paddlespeech.utils.initialize import kaiming_normal_
+from paddlespeech.utils.initialize import kaiming_uniform_
+from paddlespeech.utils.initialize import uniform_
+from paddlespeech.utils.initialize import zeros_
+
+
+def Conv1D(*args, **kwargs):
+ layer = nn.Conv1D(*args, **kwargs)
+ # Initialize the weight to be consistent with the official
+ kaiming_normal_(layer.weight)
+
+ # Initialization is consistent with torch
+ if layer.bias is not None:
+ fan_in, _ = _calculate_fan_in_and_fan_out(layer.weight)
+ if fan_in != 0:
+ bound = 1 / math.sqrt(fan_in)
+ uniform_(layer.bias, -bound, bound)
+ return layer
+
+
+# Initialization is consistent with torch
+def Linear(*args, **kwargs):
+ layer = nn.Linear(*args, **kwargs)
+ kaiming_uniform_(layer.weight, a=math.sqrt(5))
+ if layer.bias is not None:
+ fan_in, _ = _calculate_fan_in_and_fan_out(layer.weight)
+ bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+ uniform_(layer.bias, -bound, bound)
+ return layer
+
+
+class ResidualBlock(nn.Layer):
+ """ResidualBlock
+
+ Args:
+ encoder_hidden (int, optional):
+ Input feature size of the 1D convolution, by default 256
+ residual_channels (int, optional):
+            Feature size of the residual output (and also the input), by default 256
+ gate_channels (int, optional):
+ Output feature size of the 1D convolution, by default 512
+ kernel_size (int, optional):
+ Kernel size of the 1D convolution, by default 3
+ dilation (int, optional):
+ Dilation of the 1D convolution, by default 4
+ """
+
+ def __init__(self,
+ encoder_hidden: int=256,
+ residual_channels: int=256,
+ gate_channels: int=512,
+ kernel_size: int=3,
+ dilation: int=4):
+ super().__init__()
+ self.dilated_conv = Conv1D(
+ residual_channels,
+ gate_channels,
+ kernel_size,
+ padding=dilation,
+ dilation=dilation)
+ self.diffusion_projection = Linear(residual_channels, residual_channels)
+ self.conditioner_projection = Conv1D(encoder_hidden, gate_channels, 1)
+ self.output_projection = Conv1D(residual_channels, gate_channels, 1)
+
+ def forward(
+ self,
+ x: paddle.Tensor,
+ diffusion_step: paddle.Tensor,
+ cond: paddle.Tensor, ):
+ """Calculate forward propagation.
+ Args:
+            x (Tensor(float32)): input feature. (B, residual_channels, T)
+            diffusion_step (Tensor(int64)): The timestep input (adding noise step). (B,)
+            cond (Tensor(float32)): The auxiliary input (e.g. fastspeech2 encoder output). (B, encoder_hidden, T)
+
+        Returns:
+            res (Tensor(float32)): the residual output. (B, residual_channels, T)
+            skip (Tensor(float32)): the skip output. (B, residual_channels, T)
+
+ """
+ diffusion_step = self.diffusion_projection(diffusion_step).unsqueeze(-1)
+ cond = self.conditioner_projection(cond)
+ y = x + diffusion_step
+
+ y = self.dilated_conv(y) + cond
+
+ gate, filter = paddle.chunk(y, 2, axis=1)
+ y = F.sigmoid(gate) * paddle.tanh(filter)
+
+ y = self.output_projection(y)
+ residual, skip = paddle.chunk(y, 2, axis=1)
+ return (x + residual) / math.sqrt(2.0), skip
+
+
+class SinusoidalPosEmb(nn.Layer):
+ """Positional embedding
+ """
+
+ def __init__(self, dim: int=256):
+ super().__init__()
+ self.dim = dim
+
+ def forward(self, x: paddle.Tensor):
+ x = paddle.cast(x, 'float32')
+ half_dim = self.dim // 2
+ emb = math.log(10000) / (half_dim - 1)
+ emb = paddle.exp(paddle.arange(half_dim) * -emb)
+ emb = x[:, None] * emb[None, :]
+ emb = paddle.concat([emb.sin(), emb.cos()], axis=-1)
+ return emb
+
+
+class DiffNet(nn.Layer):
+ """A Mel-Spectrogram Denoiser
+
+ Args:
+ in_channels (int, optional):
+ Number of channels of the input mel-spectrogram, by default 80
+ out_channels (int, optional):
+ Number of channels of the output mel-spectrogram, by default 80
+ kernel_size (int, optional):
+ Kernel size of the residual blocks inside, by default 3
+ layers (int, optional):
+ Number of residual blocks inside, by default 20
+ stacks (int, optional):
+ The number of groups to split the residual blocks into, by default 5
+ Within each group, the dilation of the residual block grows exponentially.
+ residual_channels (int, optional):
+ Residual channel of the residual blocks, by default 256
+ gate_channels (int, optional):
+ Gate channel of the residual blocks, by default 512
+ skip_channels (int, optional):
+ Skip channel of the residual blocks, by default 256
+ aux_channels (int, optional):
+ Auxiliary channel of the residual blocks, by default 256
+ dropout (float, optional):
+ Dropout of the residual blocks, by default 0.
+ bias (bool, optional):
+ Whether to use bias in residual blocks, by default True
+ use_weight_norm (bool, optional):
+ Whether to use weight norm in all convolutions, by default False
+ """
+
+ def __init__(
+ self,
+ in_channels: int=80,
+ out_channels: int=80,
+ kernel_size: int=3,
+ layers: int=20,
+ stacks: int=5,
+ residual_channels: int=256,
+ gate_channels: int=512,
+ skip_channels: int=256,
+ aux_channels: int=256,
+ dropout: float=0.,
+ bias: bool=True,
+ use_weight_norm: bool=False,
+ init_type: str="kaiming_normal", ):
+ super().__init__()
+
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.layers = layers
+ self.aux_channels = aux_channels
+ self.residual_channels = residual_channels
+ self.gate_channels = gate_channels
+ self.kernel_size = kernel_size
+ self.dilation_cycle_length = layers // stacks
+ self.skip_channels = skip_channels
+
+ self.input_projection = Conv1D(self.in_channels, self.residual_channels,
+ 1)
+ self.diffusion_embedding = SinusoidalPosEmb(self.residual_channels)
+ dim = self.residual_channels
+ self.mlp = nn.Sequential(
+ Linear(dim, dim * 4), nn.Mish(), Linear(dim * 4, dim))
+ self.residual_layers = nn.LayerList([
+ ResidualBlock(
+ encoder_hidden=self.aux_channels,
+ residual_channels=self.residual_channels,
+ gate_channels=self.gate_channels,
+ kernel_size=self.kernel_size,
+ dilation=2**(i % self.dilation_cycle_length))
+ for i in range(self.layers)
+ ])
+ self.skip_projection = Conv1D(self.residual_channels,
+ self.skip_channels, 1)
+ self.output_projection = Conv1D(self.residual_channels,
+ self.out_channels, 1)
+ zeros_(self.output_projection.weight)
+
+ def forward(
+ self,
+ spec: paddle.Tensor,
+ diffusion_step: paddle.Tensor,
+ cond: paddle.Tensor, ):
+ """Calculate forward propagation.
+ Args:
+ spec (Tensor(float32)): The input mel-spectrogram. (B, n_mel, T)
+ diffusion_step (Tensor(int64)): The timestep input (adding noise step). (B,)
+ cond (Tensor(float32)): The auxiliary input (e.g. fastspeech2 encoder output). (B, D_enc_out, T)
+
+ Returns:
+            x (Tensor(float32)): predicted noise (B, n_mel, T)
+
+ """
+ x = spec
+ x = self.input_projection(x) # x [B, residual_channel, T]
+
+ x = F.relu(x)
+ diffusion_step = self.diffusion_embedding(diffusion_step)
+ diffusion_step = self.mlp(diffusion_step)
+ skip = []
+ for layer_id, layer in enumerate(self.residual_layers):
+ x, skip_connection = layer(
+ x=x,
+ diffusion_step=diffusion_step,
+ cond=cond, )
+ skip.append(skip_connection)
+ x = paddle.sum(
+ paddle.stack(skip), axis=0) / math.sqrt(len(self.residual_layers))
+ x = self.skip_projection(x)
+ x = F.relu(x)
+ x = self.output_projection(x) # [B, 80, T]
+ return x
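A minimal smoke test for the new denoiser with its default sizes; the conditioning tensor stands in for a FastSpeech2 encoder output and is random here:

import paddle

from paddlespeech.t2s.modules.diffnet import DiffNet

denoiser = DiffNet(in_channels=80, out_channels=80, aux_channels=256)

B, T = 2, 100
spec = paddle.randn([B, 80, T])         # noisy mel-spectrogram
t = paddle.randint(0, 1000, shape=[B])  # diffusion timestep per sample
cond = paddle.randn([B, 256, T])        # e.g. FastSpeech2 encoder output

noise_pred = denoiser(spec, t, cond)
print(noise_pred.shape)  # [2, 80, 100]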
diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py
index be684ce3..adbd9ce7 100644
--- a/paddlespeech/t2s/modules/diffusion.py
+++ b/paddlespeech/t2s/modules/diffusion.py
@@ -12,184 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Diffusion denoising related modules for paddle"""
-import math
from typing import Callable
from typing import Optional
from typing import Tuple
+import numpy as np
import paddle
import ppdiffusers
from paddle import nn
-from ppdiffusers.models.embeddings import Timesteps
from ppdiffusers.schedulers import DDPMScheduler
-from paddlespeech.t2s.modules.nets_utils import initialize
-from paddlespeech.t2s.modules.residual_block import WaveNetResidualBlock
-
-
-class WaveNetDenoiser(nn.Layer):
- """A Mel-Spectrogram Denoiser modified from WaveNet
-
- Args:
- in_channels (int, optional):
- Number of channels of the input mel-spectrogram, by default 80
- out_channels (int, optional):
- Number of channels of the output mel-spectrogram, by default 80
- kernel_size (int, optional):
- Kernel size of the residual blocks inside, by default 3
- layers (int, optional):
- Number of residual blocks inside, by default 20
- stacks (int, optional):
- The number of groups to split the residual blocks into, by default 5
- Within each group, the dilation of the residual block grows exponentially.
- residual_channels (int, optional):
- Residual channel of the residual blocks, by default 256
- gate_channels (int, optional):
- Gate channel of the residual blocks, by default 512
- skip_channels (int, optional):
- Skip channel of the residual blocks, by default 256
- aux_channels (int, optional):
- Auxiliary channel of the residual blocks, by default 256
- dropout (float, optional):
- Dropout of the residual blocks, by default 0.
- bias (bool, optional):
- Whether to use bias in residual blocks, by default True
- use_weight_norm (bool, optional):
- Whether to use weight norm in all convolutions, by default False
- """
-
- def __init__(
- self,
- in_channels: int=80,
- out_channels: int=80,
- kernel_size: int=3,
- layers: int=20,
- stacks: int=5,
- residual_channels: int=256,
- gate_channels: int=512,
- skip_channels: int=256,
- aux_channels: int=256,
- dropout: float=0.,
- bias: bool=True,
- use_weight_norm: bool=False,
- init_type: str="kaiming_normal", ):
- super().__init__()
-
- # initialize parameters
- initialize(self, init_type)
-
- self.in_channels = in_channels
- self.out_channels = out_channels
- self.aux_channels = aux_channels
- self.layers = layers
- self.stacks = stacks
- self.kernel_size = kernel_size
-
- assert layers % stacks == 0
- layers_per_stack = layers // stacks
-
- self.first_t_emb = nn.Sequential(
- Timesteps(
- residual_channels,
- flip_sin_to_cos=False,
- downscale_freq_shift=1),
- nn.Linear(residual_channels, residual_channels * 4),
- nn.Mish(), nn.Linear(residual_channels * 4, residual_channels))
- self.t_emb_layers = nn.LayerList([
- nn.Linear(residual_channels, residual_channels)
- for _ in range(layers)
- ])
-
- self.first_conv = nn.Conv1D(
- in_channels, residual_channels, 1, bias_attr=True)
- self.first_act = nn.ReLU()
-
- self.conv_layers = nn.LayerList()
- for layer in range(layers):
- dilation = 2**(layer % layers_per_stack)
- conv = WaveNetResidualBlock(
- kernel_size=kernel_size,
- residual_channels=residual_channels,
- gate_channels=gate_channels,
- skip_channels=skip_channels,
- aux_channels=aux_channels,
- dilation=dilation,
- dropout=dropout,
- bias=bias)
- self.conv_layers.append(conv)
-
- final_conv = nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True)
- nn.initializer.Constant(0.0)(final_conv.weight)
- self.last_conv_layers = nn.Sequential(nn.ReLU(),
- nn.Conv1D(
- skip_channels,
- skip_channels,
- 1,
- bias_attr=True),
- nn.ReLU(), final_conv)
-
- if use_weight_norm:
- self.apply_weight_norm()
-
- def forward(self, x, t, c):
- """Denoise mel-spectrogram.
-
- Args:
- x(Tensor):
- Shape (N, C_in, T), The input mel-spectrogram.
- t(Tensor):
- Shape (N), The timestep input.
- c(Tensor):
- Shape (N, C_aux, T'). The auxiliary input (e.g. fastspeech2 encoder output).
-
- Returns:
- Tensor: Shape (N, C_out, T), the denoised mel-spectrogram.
- """
- assert c.shape[-1] == x.shape[-1]
-
- if t.shape[0] != x.shape[0]:
- t = t.tile([x.shape[0]])
- t_emb = self.first_t_emb(t)
- t_embs = [
- t_emb_layer(t_emb)[..., None] for t_emb_layer in self.t_emb_layers
- ]
-
- x = self.first_conv(x)
- x = self.first_act(x)
- skips = 0
- for f, t in zip(self.conv_layers, t_embs):
- x = x + t
- x, s = f(x, c)
- skips += s
- skips *= math.sqrt(1.0 / len(self.conv_layers))
-
- x = self.last_conv_layers(skips)
- return x
-
- def apply_weight_norm(self):
- """Recursively apply weight normalization to all the Convolution layers
- in the sublayers.
- """
-
- def _apply_weight_norm(layer):
- if isinstance(layer, (nn.Conv1D, nn.Conv2D)):
- nn.utils.weight_norm(layer)
-
- self.apply(_apply_weight_norm)
-
- def remove_weight_norm(self):
- """Recursively remove weight normalization from all the Convolution
- layers in the sublayers.
- """
-
- def _remove_weight_norm(layer):
- try:
- nn.utils.remove_weight_norm(layer)
- except ValueError:
- pass
-
- self.apply(_remove_weight_norm)
-
class GaussianDiffusion(nn.Layer):
"""Common Gaussian Diffusion Denoising Model Module
@@ -207,6 +39,13 @@ class GaussianDiffusion(nn.Layer):
beta schedule parameter for the scheduler, by default 'squaredcos_cap_v2' (cosine schedule).
num_max_timesteps (int, optional):
The max timestep transition from real to noise, by default None.
+ stretch (bool, optional):
+            Whether to stretch the feature before diffusion, by default True.
+        min_values (paddle.Tensor):
+            The minimum value of the feature to stretch.
+        max_values (paddle.Tensor):
+            The maximum value of the feature to stretch.
+
Examples:
>>> import paddle
@@ -294,13 +133,17 @@ class GaussianDiffusion(nn.Layer):
"""
- def __init__(self,
- denoiser: nn.Layer,
- num_train_timesteps: Optional[int]=1000,
- beta_start: Optional[float]=0.0001,
- beta_end: Optional[float]=0.02,
- beta_schedule: Optional[str]="squaredcos_cap_v2",
- num_max_timesteps: Optional[int]=None):
+ def __init__(
+ self,
+ denoiser: nn.Layer,
+ num_train_timesteps: Optional[int]=1000,
+ beta_start: Optional[float]=0.0001,
+ beta_end: Optional[float]=0.02,
+ beta_schedule: Optional[str]="squaredcos_cap_v2",
+ num_max_timesteps: Optional[int]=None,
+ stretch: bool=True,
+ min_values: paddle.Tensor=None,
+ max_values: paddle.Tensor=None, ):
super().__init__()
self.num_train_timesteps = num_train_timesteps
@@ -315,6 +158,22 @@ class GaussianDiffusion(nn.Layer):
beta_end=beta_end,
beta_schedule=beta_schedule)
self.num_max_timesteps = num_max_timesteps
+ self.stretch = stretch
+ self.min_values = min_values
+ self.max_values = max_values
+
+ def norm_spec(self, x):
+ """
+ Linearly map x to [-1, 1]
+ Args:
+ x: [B, T, N]
+ """
+ return (x - self.min_values) / (self.max_values - self.min_values
+ ) * 2 - 1
+
+ def denorm_spec(self, x):
+ return (x + 1) / 2 * (self.max_values - self.min_values
+ ) + self.min_values
def forward(self, x: paddle.Tensor, cond: Optional[paddle.Tensor]=None
) -> Tuple[paddle.Tensor, paddle.Tensor]:
@@ -333,6 +192,11 @@ class GaussianDiffusion(nn.Layer):
The noises which is added to the input.
"""
+ if self.stretch:
+ x = x.transpose((0, 2, 1))
+ x = self.norm_spec(x)
+ x = x.transpose((0, 2, 1))
+
noise_scheduler = self.noise_scheduler
# Sample noise that we'll add to the mel-spectrograms
@@ -360,7 +224,7 @@ class GaussianDiffusion(nn.Layer):
num_inference_steps: Optional[int]=1000,
strength: Optional[float]=None,
scheduler_type: Optional[str]="ddpm",
- clip_noise: Optional[bool]=True,
+ clip_noise: Optional[bool]=False,
clip_noise_range: Optional[Tuple[float, float]]=(-1, 1),
callback: Optional[Callable[[int, int, int, paddle.Tensor],
None]]=None,
@@ -369,9 +233,9 @@ class GaussianDiffusion(nn.Layer):
Args:
noise (Tensor):
- The input tensor as a starting point for denoising.
+ The input tensor as a starting point for denoising.
cond (Tensor, optional):
- Conditional input for compute noises.
+                Conditional input for computing noise. (N, C_aux, T)
ref_x (Tensor, optional):
The real output for the denoising process to refer.
num_inference_steps (int, optional):
@@ -382,6 +246,7 @@ class GaussianDiffusion(nn.Layer):
scheduler_type (str, optional):
Noise scheduler for generate noises.
Choose a great scheduler can skip many denoising step, by default 'ddpm'.
+                Only 'ddpm' is supported for now.
clip_noise (bool, optional):
Whether to clip each denoised output, by default True.
clip_noise_range (tuple, optional):
@@ -425,48 +290,33 @@ class GaussianDiffusion(nn.Layer):
# set timesteps
scheduler.set_timesteps(num_inference_steps)
- # prepare first noise variables
noisy_input = noise
- timesteps = scheduler.timesteps
- if ref_x is not None:
- init_timestep = None
- if strength is None or strength < 0. or strength > 1.:
- strength = None
- if self.num_max_timesteps is not None:
- strength = self.num_max_timesteps / self.num_train_timesteps
- if strength is not None:
- # get the original timestep using init_timestep
- init_timestep = min(
- int(num_inference_steps * strength), num_inference_steps)
- t_start = max(num_inference_steps - init_timestep, 0)
- timesteps = scheduler.timesteps[t_start:]
- num_inference_steps = num_inference_steps - t_start
- noisy_input = scheduler.add_noise(
- ref_x, noise, timesteps[:1].tile([noise.shape[0]]))
-
- # denoising loop
+ if self.stretch and ref_x is not None:
+ ref_x = ref_x.transpose((0, 2, 1))
+ ref_x = self.norm_spec(ref_x)
+ ref_x = ref_x.transpose((0, 2, 1))
+
+ # for ddpm
+ timesteps = paddle.to_tensor(
+ np.flipud(np.arange(num_inference_steps)))
+ noisy_input = scheduler.add_noise(ref_x, noise, timesteps[0])
+
denoised_output = noisy_input
if clip_noise:
n_min, n_max = clip_noise_range
denoised_output = paddle.clip(denoised_output, n_min, n_max)
- num_warmup_steps = len(
- timesteps) - num_inference_steps * scheduler.order
for i, t in enumerate(timesteps):
denoised_output = scheduler.scale_model_input(denoised_output, t)
-
- # predict the noise residual
noise_pred = self.denoiser(denoised_output, t, cond)
-
# compute the previous noisy sample x_t -> x_t-1
denoised_output = scheduler.step(noise_pred, t,
denoised_output).prev_sample
if clip_noise:
denoised_output = paddle.clip(denoised_output, n_min, n_max)
- # call the callback, if provided
- if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and
- (i + 1) % scheduler.order == 0):
- if callback is not None and i % callback_steps == 0:
- callback(i, t, len(timesteps), denoised_output)
+ if self.stretch:
+ denoised_output = denoised_output.transpose((0, 2, 1))
+ denoised_output = self.denorm_spec(denoised_output)
+ denoised_output = denoised_output.transpose((0, 2, 1))
return denoised_output
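The new `stretch` option linearly maps the feature into [-1, 1] before adding noise and maps it back after denoising; a standalone round trip of the two helpers (the statistics are dummy values, in practice they come from the training data):

import paddle

# per-bin statistics, shape (n_mel,), broadcast over (B, T, n_mel) inside norm_spec
min_values = paddle.full([80], -6.0)
max_values = paddle.full([80], 1.5)

x = paddle.uniform([2, 100, 80], min=-6.0, max=1.5)  # (B, T, n_mel)

x_norm = (x - min_values) / (max_values - min_values) * 2 - 1
assert float(x_norm.min()) >= -1 and float(x_norm.max()) <= 1

x_back = (x_norm + 1) / 2 * (max_values - min_values) + min_values
print(float((x - x_back).abs().max()))  # ~0, the mapping is invertible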
diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py
index 1a43f5ef..b4d78364 100644
--- a/paddlespeech/t2s/modules/losses.py
+++ b/paddlespeech/t2s/modules/losses.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
+from typing import Tuple
import librosa
import numpy as np
@@ -19,8 +20,13 @@ import paddle
from paddle import nn
from paddle.nn import functional as F
from scipy import signal
+from scipy.stats import betabinom
+from typeguard import check_argument_types
from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask
+from paddlespeech.t2s.modules.predictor.duration_predictor import (
+ DurationPredictorLoss, # noqa: H301
+)
# Losses for WaveRNN
@@ -1126,3 +1132,195 @@ class MLMLoss(nn.Layer):
text_masked_pos_reshape) / paddle.sum((text_masked_pos) + 1e-10)
return mlm_loss, text_mlm_loss
+
+
+class VarianceLoss(nn.Layer):
+ def __init__(self, use_masking: bool=True,
+ use_weighted_masking: bool=False):
+ """Initialize JETS variance loss module.
+ Args:
+ use_masking (bool): Whether to apply masking for padded part in loss
+ calculation.
+ use_weighted_masking (bool): Whether to weighted masking in loss
+ calculation.
+
+ """
+ assert check_argument_types()
+ super().__init__()
+
+ assert (use_masking != use_weighted_masking) or not use_masking
+ self.use_masking = use_masking
+ self.use_weighted_masking = use_weighted_masking
+
+ # define criterions
+ reduction = "none" if self.use_weighted_masking else "mean"
+ self.mse_criterion = nn.MSELoss(reduction=reduction)
+ self.duration_criterion = DurationPredictorLoss(reduction=reduction)
+
+ def forward(
+ self,
+ d_outs: paddle.Tensor,
+ ds: paddle.Tensor,
+ p_outs: paddle.Tensor,
+ ps: paddle.Tensor,
+ e_outs: paddle.Tensor,
+ es: paddle.Tensor,
+ ilens: paddle.Tensor,
+ ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+ """Calculate forward propagation.
+
+ Args:
+ d_outs (LongTensor): Batch of outputs of duration predictor (B, T_text).
+ ds (LongTensor): Batch of durations (B, T_text).
+ p_outs (Tensor): Batch of outputs of pitch predictor (B, T_text, 1).
+ ps (Tensor): Batch of target token-averaged pitch (B, T_text, 1).
+ e_outs (Tensor): Batch of outputs of energy predictor (B, T_text, 1).
+ es (Tensor): Batch of target token-averaged energy (B, T_text, 1).
+ ilens (LongTensor): Batch of the lengths of each input (B,).
+
+ Returns:
+ Tensor: Duration predictor loss value.
+ Tensor: Pitch predictor loss value.
+ Tensor: Energy predictor loss value.
+
+ """
+ # apply mask to remove padded part
+ if self.use_masking:
+ duration_masks = paddle.to_tensor(
+ make_non_pad_mask(ilens), place=ds.place)
+ d_outs = d_outs.masked_select(duration_masks)
+ ds = ds.masked_select(duration_masks)
+ pitch_masks = paddle.to_tensor(
+ make_non_pad_mask(ilens).unsqueeze(-1), place=ds.place)
+ p_outs = p_outs.masked_select(pitch_masks)
+ e_outs = e_outs.masked_select(pitch_masks)
+ ps = ps.masked_select(pitch_masks)
+ es = es.masked_select(pitch_masks)
+
+ # calculate loss
+ duration_loss = self.duration_criterion(d_outs, ds)
+ pitch_loss = self.mse_criterion(p_outs, ps)
+ energy_loss = self.mse_criterion(e_outs, es)
+
+ # make weighted mask and apply it
+ if self.use_weighted_masking:
+ duration_masks = paddle.to_tensor(
+ make_non_pad_mask(ilens), place=ds.place)
+            duration_weights = (duration_masks.astype('float32') /
+                                duration_masks.astype('float32').sum(
+                                    axis=1, keepdim=True))
+            duration_weights /= ds.shape[0]
+
+            # apply weight
+            duration_loss = (duration_loss.multiply(duration_weights)
+                             .masked_select(duration_masks).sum())
+            pitch_masks = duration_masks.unsqueeze(-1)
+            pitch_weights = duration_weights.unsqueeze(-1)
+            pitch_loss = pitch_loss.multiply(pitch_weights).masked_select(
+                pitch_masks).sum()
+            energy_loss = (energy_loss.multiply(pitch_weights)
+                           .masked_select(pitch_masks).sum())
+
+ return duration_loss, pitch_loss, energy_loss
+
+
+class ForwardSumLoss(nn.Layer):
+ """
+ https://openreview.net/forum?id=0NQwnnwAORi
+ """
+
+ def __init__(self, cache_prior: bool=True):
+ """
+ Args:
+ cache_prior (bool): Whether to cache beta-binomial prior
+ """
+ super().__init__()
+ self.cache_prior = cache_prior
+ self._cache = {}
+
+ def forward(
+ self,
+ log_p_attn: paddle.Tensor,
+ ilens: paddle.Tensor,
+ olens: paddle.Tensor,
+ blank_prob: float=np.e**-1, ) -> paddle.Tensor:
+ """
+ Args:
+ log_p_attn (Tensor): Batch of log probability of attention matrix (B, T_feats, T_text).
+ ilens (Tensor): Batch of the lengths of each input (B,).
+ olens (Tensor): Batch of the lengths of each target (B,).
+ blank_prob (float): Blank symbol probability
+
+ Returns:
+ Tensor: forwardsum loss value.
+ """
+
+ B = log_p_attn.shape[0]
+ # add beta-binomial prior
+ bb_prior = self._generate_prior(ilens, olens)
+ bb_prior = paddle.to_tensor(
+ bb_prior, dtype=log_p_attn.dtype, place=log_p_attn.place)
+ log_p_attn = log_p_attn + bb_prior
+
+ # a row must be added to the attention matrix to account for blank token of CTC loss
+ # (B,T_feats,T_text+1)
+ log_p_attn_pd = F.pad(
+ log_p_attn, (0, 0, 0, 0, 1, 0), value=np.log(blank_prob))
+ loss = 0
+ for bidx in range(B):
+            # construct target sequence.
+            # Every text token is mapped to a unique sequence number.
+ target_seq = paddle.arange(
+ 1, ilens[bidx] + 1, dtype="int32").unsqueeze(0)
+ cur_log_p_attn_pd = log_p_attn_pd[bidx, :olens[bidx], :ilens[
+ bidx] + 1].unsqueeze(1) # (T_feats,1,T_text+1)
+            # The input of the ctc_loss API needs to be fixed
+ loss += F.ctc_loss(
+ log_probs=cur_log_p_attn_pd,
+ labels=target_seq,
+ input_lengths=olens[bidx:bidx + 1],
+ label_lengths=ilens[bidx:bidx + 1])
+ loss = loss / B
+
+ return loss
+
+ def _generate_prior(self, text_lengths, feats_lengths,
+ w=1) -> paddle.Tensor:
+ """Generate alignment prior formulated as beta-binomial distribution
+
+ Args:
+ text_lengths (Tensor): Batch of the lengths of each input (B,).
+ feats_lengths (Tensor): Batch of the lengths of each target (B,).
+ w (float): Scaling factor; lower -> wider the width
+
+ Returns:
+ Tensor: Batched 2d static prior matrix (B, T_feats, T_text)
+ """
+ B = len(text_lengths)
+ T_text = text_lengths.max()
+ T_feats = feats_lengths.max()
+
+ bb_prior = paddle.full((B, T_feats, T_text), fill_value=-np.inf)
+ for bidx in range(B):
+ T = feats_lengths[bidx].item()
+ N = text_lengths[bidx].item()
+
+ key = str(T) + ',' + str(N)
+ if self.cache_prior and key in self._cache:
+ prob = self._cache[key]
+ else:
+ alpha = w * np.arange(1, T + 1, dtype=float) # (T,)
+ beta = w * np.array([T - t + 1 for t in alpha])
+ k = np.arange(N)
+ batched_k = k[..., None] # (N,1)
+ prob = betabinom.pmf(batched_k, N, alpha, beta) # (N,T)
+
+ # store cache
+ if self.cache_prior and key not in self._cache:
+ self._cache[key] = prob
+
+ prob = paddle.to_tensor(
+ prob, place=text_lengths.place, dtype="float32").transpose(
+ (1, 0)) # -> (T,N)
+ bb_prior[bidx, :T, :N] = prob
+
+ return bb_prior
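A usage sketch for the new `VarianceLoss` with random tensors shaped as in the docstring (the numbers are made up; in JETS training these come from the variance predictors and the aligner):

import paddle

from paddlespeech.t2s.modules.losses import VarianceLoss

criterion = VarianceLoss(use_masking=True)

B, T_text = 2, 10
ilens = paddle.to_tensor([10, 7])             # valid token lengths per utterance
ds = paddle.randint(1, 5, shape=[B, T_text])  # target durations
d_outs = paddle.rand([B, T_text])             # predicted (log-domain) durations
ps = paddle.randn([B, T_text, 1])             # token-averaged pitch targets
p_outs = paddle.randn([B, T_text, 1])
es = paddle.randn([B, T_text, 1])             # token-averaged energy targets
e_outs = paddle.randn([B, T_text, 1])

dur_loss, pitch_loss, energy_loss = criterion(
    d_outs=d_outs, ds=ds, p_outs=p_outs, ps=ps, e_outs=e_outs, es=es, ilens=ilens)
print(float(dur_loss), float(pitch_loss), float(energy_loss))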
diff --git a/paddlespeech/t2s/modules/masked_fill.py b/paddlespeech/t2s/modules/masked_fill.py
index b3222254..1445a926 100644
--- a/paddlespeech/t2s/modules/masked_fill.py
+++ b/paddlespeech/t2s/modules/masked_fill.py
@@ -38,11 +38,9 @@ def masked_fill(xs: paddle.Tensor,
value: Union[float, int]):
# comment following line for converting dygraph to static graph.
# assert is_broadcastable(xs.shape, mask.shape) is True
- # bshape = paddle.broadcast_shape(xs.shape, mask.shape)
bshape = broadcast_shape(xs.shape, mask.shape)
mask.stop_gradient = True
mask = mask.broadcast_to(bshape)
-
trues = paddle.ones_like(xs) * value
mask = mask.cast(dtype=paddle.bool)
xs = paddle.where(mask, trues, xs)
diff --git a/paddlespeech/t2s/modules/nets_utils.py b/paddlespeech/t2s/modules/nets_utils.py
index 798e4dee..57c46e3a 100644
--- a/paddlespeech/t2s/modules/nets_utils.py
+++ b/paddlespeech/t2s/modules/nets_utils.py
@@ -20,6 +20,44 @@ import paddle
from paddle import nn
from typeguard import check_argument_types
+from paddlespeech.utils.initialize import _calculate_fan_in_and_fan_out
+from paddlespeech.utils.initialize import kaiming_uniform_
+from paddlespeech.utils.initialize import normal_
+from paddlespeech.utils.initialize import ones_
+from paddlespeech.utils.initialize import uniform_
+from paddlespeech.utils.initialize import zeros_
+
+
+# default init method of torch
+# copy from https://github.com/PaddlePaddle/PaddleSpeech/blob/9cf8c1985a98bb380c183116123672976bdfe5c9/paddlespeech/t2s/models/vits/vits.py#L506
+def _reset_parameters(module):
+ if isinstance(module, (nn.Conv1D, nn.Conv1DTranspose, nn.Conv2D,
+ nn.Conv2DTranspose)):
+ kaiming_uniform_(module.weight, a=math.sqrt(5))
+ if module.bias is not None:
+ fan_in, _ = _calculate_fan_in_and_fan_out(module.weight)
+ if fan_in != 0:
+ bound = 1 / math.sqrt(fan_in)
+ uniform_(module.bias, -bound, bound)
+
+ if isinstance(module,
+ (nn.BatchNorm1D, nn.BatchNorm2D, nn.GroupNorm, nn.LayerNorm)):
+ ones_(module.weight)
+ zeros_(module.bias)
+
+ if isinstance(module, nn.Linear):
+ kaiming_uniform_(module.weight, a=math.sqrt(5))
+ if module.bias is not None:
+ fan_in, _ = _calculate_fan_in_and_fan_out(module.weight)
+ bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+ uniform_(module.bias, -bound, bound)
+
+ if isinstance(module, nn.Embedding):
+ normal_(module.weight)
+ if module._padding_idx is not None:
+ with paddle.no_grad():
+ module.weight[module._padding_idx] = 0
+
def pad_list(xs, pad_value):
"""Perform padding for the list of tensors.
@@ -143,20 +181,20 @@ def make_pad_mask(lengths, xs=None, length_dim=-1):
if length_dim == 0:
raise ValueError("length_dim cannot be 0: {}".format(length_dim))
- bs = paddle.shape(lengths)[0]
+ bs = paddle.shape(lengths)
if xs is None:
- maxlen = lengths.max()
+ maxlen = paddle.cast(lengths.max(), dtype=bs.dtype)
else:
maxlen = paddle.shape(xs)[length_dim]
seq_range = paddle.arange(0, maxlen, dtype=paddle.int64)
+        # the position of the last expand op in VITS
seq_range_expand = seq_range.unsqueeze(0).expand([bs, maxlen])
seq_length_expand = lengths.unsqueeze(-1)
mask = seq_range_expand >= seq_length_expand.cast(seq_range_expand.dtype)
if xs is not None:
assert paddle.shape(xs)[0] == bs, (paddle.shape(xs)[0], bs)
-
if length_dim < 0:
length_dim = len(paddle.shape(xs)) + length_dim
# ind = (:, None, ..., None, :, , None, ..., None)
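`_reset_parameters` reproduces torch's default initializers and is meant to be applied recursively, e.g.:

import paddle
from paddle import nn

from paddlespeech.t2s.modules.nets_utils import _reset_parameters

# a toy module mixing the layer types handled above
net = nn.Sequential(
    nn.Conv1D(80, 256, 3, padding=1),
    nn.BatchNorm1D(256),
    nn.Linear(256, 80), )

# re-initialize every sublayer with torch-style defaults
net.apply(_reset_parameters)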
diff --git a/paddlespeech/t2s/modules/predictor/variance_predictor.py b/paddlespeech/t2s/modules/predictor/variance_predictor.py
index 4c2a67cc..197f7359 100644
--- a/paddlespeech/t2s/modules/predictor/variance_predictor.py
+++ b/paddlespeech/t2s/modules/predictor/variance_predictor.py
@@ -96,7 +96,7 @@ class VariancePredictor(nn.Layer):
xs = f(xs)
# (B, Tmax, 1)
xs = self.linear(xs.transpose([0, 2, 1]))
-
+
if x_masks is not None:
xs = masked_fill(xs, x_masks, 0.0)
return xs
diff --git a/paddlespeech/t2s/modules/transformer/attention.py b/paddlespeech/t2s/modules/transformer/attention.py
index e3c9a992..3237be1b 100644
--- a/paddlespeech/t2s/modules/transformer/attention.py
+++ b/paddlespeech/t2s/modules/transformer/attention.py
@@ -103,7 +103,7 @@ class MultiHeadedAttention(nn.Layer):
mask = paddle.logical_not(mask)
# assume scores.dtype==paddle.float32, we only use "float32" here
dtype = str(scores.dtype).split(".")[-1]
- min_value = numpy.finfo(dtype).min
+ min_value = float(numpy.finfo(dtype).min)
scores = masked_fill(scores, mask, min_value)
# (batch, head, time1, time2)
self.attn = softmax(scores)
@@ -192,12 +192,11 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
x_padded = paddle.concat([zero_pad, x], axis=-1)
x_padded = x_padded.reshape([b, h, t2 + 1, t1])
# only keep the positions from 0 to time2
- x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :t2 // 2 + 1]
-
+ new_t = paddle.cast(paddle.floor(t2 / 2) + 1, dtype='int32')
+ x = x_padded[:, :, 1:].reshape([b, h, t1, t2])[:, :, :, :new_t]
if self.zero_triu:
ones = paddle.ones((t1, t2))
x = x * paddle.tril(ones, t2 - t1)[None, None, :, :]
-
return x
def forward(self, query, key, value, pos_emb, mask):
@@ -221,7 +220,6 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
q, k, v = self.forward_qkv(query, key, value)
# (batch, time1, head, d_k)
q = q.transpose([0, 2, 1, 3])
-
n_batch_pos = paddle.shape(pos_emb)[0]
p = self.linear_pos(pos_emb).reshape(
[n_batch_pos, -1, self.h, self.d_k])
diff --git a/paddlespeech/t2s/modules/transformer/embedding.py b/paddlespeech/t2s/modules/transformer/embedding.py
index 7ba301cb..f90eb44a 100644
--- a/paddlespeech/t2s/modules/transformer/embedding.py
+++ b/paddlespeech/t2s/modules/transformer/embedding.py
@@ -198,7 +198,8 @@ class RelPositionalEncoding(nn.Layer):
x = x * self.xscale
T = paddle.shape(x)[1]
pe_size = paddle.shape(self.pe)
- pos_emb = self.pe[:, pe_size[1] // 2 - T + 1:pe_size[1] // 2 + T, ]
+ tmp = paddle.cast(paddle.floor(pe_size[1] / 2), dtype='int32')
+ pos_emb = self.pe[:, tmp - T + 1:tmp + T, ]
return self.dropout(x), self.dropout(pos_emb)
diff --git a/paddlespeech/t2s/modules/transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py
index f2aed589..0fd94689 100644
--- a/paddlespeech/t2s/modules/transformer/encoder.py
+++ b/paddlespeech/t2s/modules/transformer/encoder.py
@@ -15,6 +15,7 @@
from typing import List
from typing import Union
+import paddle
from paddle import nn
from paddlespeech.t2s.modules.activation import get_activation
@@ -390,7 +391,13 @@ class TransformerEncoder(BaseEncoder):
padding_idx=padding_idx,
encoder_type="transformer")
- def forward(self, xs, masks):
+ def forward(self,
+ xs: paddle.Tensor,
+ masks: paddle.Tensor,
+ note_emb: paddle.Tensor=None,
+ note_dur_emb: paddle.Tensor=None,
+ is_slur_emb: paddle.Tensor=None,
+ scale: int=16):
"""Encoder input sequence.
Args:
@@ -398,6 +405,12 @@ class TransformerEncoder(BaseEncoder):
Input tensor (#batch, time, idim).
masks(Tensor):
Mask tensor (#batch, 1, time).
+ note_emb(Tensor):
+ Input tensor (#batch, time, attention_dim).
+ note_dur_emb(Tensor):
+ Input tensor (#batch, time, attention_dim).
+ is_slur_emb(Tensor):
+ Input tensor (#batch, time, attention_dim).
Returns:
Tensor:
@@ -406,6 +419,8 @@ class TransformerEncoder(BaseEncoder):
Mask tensor (#batch, 1, time).
"""
xs = self.embed(xs)
+ if note_emb is not None:
+ xs = scale * xs + note_emb + note_dur_emb + is_slur_emb
xs, masks = self.encoders(xs, masks)
if self.normalize_before:
xs = self.after_norm(xs)
diff --git a/paddlespeech/t2s/modules/transformer/lightconv.py b/paddlespeech/t2s/modules/transformer/lightconv.py
index 22217d50..85336f4f 100644
--- a/paddlespeech/t2s/modules/transformer/lightconv.py
+++ b/paddlespeech/t2s/modules/transformer/lightconv.py
@@ -110,7 +110,7 @@ class LightweightConvolution(nn.Layer):
(batch, time1, time2) mask
Return:
- Tensor: ouput. (batch, time1, d_model)
+ Tensor: output. (batch, time1, d_model)
"""
# linear -> GLU -> lightconv -> linear
diff --git a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
index 91d67ca5..a322becd 100644
--- a/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
+++ b/paddlespeech/t2s/modules/transformer/multi_layer_conv.py
@@ -69,8 +69,8 @@ class MultiLayeredConv1d(nn.Layer):
Tensor: Batch of output tensors (B, T, in_chans).
"""
x = self.relu(self.w_1(x.transpose([0, 2, 1]))).transpose([0, 2, 1])
- return self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose(
- [0, 2, 1])
+ out = self.w_2(self.dropout(x).transpose([0, 2, 1])).transpose([0, 2, 1])
+ return out
class Conv1dLinear(nn.Layer):
diff --git a/paddlespeech/t2s/modules/wavenet_denoiser.py b/paddlespeech/t2s/modules/wavenet_denoiser.py
new file mode 100644
index 00000000..61c92265
--- /dev/null
+++ b/paddlespeech/t2s/modules/wavenet_denoiser.py
@@ -0,0 +1,185 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+
+import paddle
+from paddle import nn
+from ppdiffusers.models.embeddings import Timesteps
+
+from paddlespeech.t2s.modules.nets_utils import initialize
+from paddlespeech.t2s.modules.residual_block import WaveNetResidualBlock
+
+
+class WaveNetDenoiser(nn.Layer):
+ """A Mel-Spectrogram Denoiser modified from WaveNet
+
+ Args:
+ in_channels (int, optional):
+ Number of channels of the input mel-spectrogram, by default 80
+ out_channels (int, optional):
+ Number of channels of the output mel-spectrogram, by default 80
+ kernel_size (int, optional):
+ Kernel size of the residual blocks inside, by default 3
+ layers (int, optional):
+ Number of residual blocks inside, by default 20
+ stacks (int, optional):
+ The number of groups to split the residual blocks into, by default 5
+ Within each group, the dilation of the residual block grows exponentially.
+ residual_channels (int, optional):
+ Residual channel of the residual blocks, by default 256
+ gate_channels (int, optional):
+ Gate channel of the residual blocks, by default 512
+ skip_channels (int, optional):
+ Skip channel of the residual blocks, by default 256
+ aux_channels (int, optional):
+ Auxiliary channel of the residual blocks, by default 256
+ dropout (float, optional):
+ Dropout of the residual blocks, by default 0.
+ bias (bool, optional):
+ Whether to use bias in residual blocks, by default True
+ use_weight_norm (bool, optional):
+ Whether to use weight norm in all convolutions, by default False
+ """
+
+ def __init__(
+ self,
+ in_channels: int=80,
+ out_channels: int=80,
+ kernel_size: int=3,
+ layers: int=20,
+ stacks: int=5,
+ residual_channels: int=256,
+ gate_channels: int=512,
+ skip_channels: int=256,
+ aux_channels: int=256,
+ dropout: float=0.,
+ bias: bool=True,
+ use_weight_norm: bool=False,
+ init_type: str="kaiming_normal", ):
+ super().__init__()
+
+ # initialize parameters
+ initialize(self, init_type)
+
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.aux_channels = aux_channels
+ self.layers = layers
+ self.stacks = stacks
+ self.kernel_size = kernel_size
+
+ assert layers % stacks == 0
+ layers_per_stack = layers // stacks
+
+ self.first_t_emb = nn.Sequential(
+ Timesteps(
+ residual_channels,
+ flip_sin_to_cos=False,
+ downscale_freq_shift=1),
+ nn.Linear(residual_channels, residual_channels * 4),
+ nn.Mish(), nn.Linear(residual_channels * 4, residual_channels))
+ self.t_emb_layers = nn.LayerList([
+ nn.Linear(residual_channels, residual_channels)
+ for _ in range(layers)
+ ])
+
+ self.first_conv = nn.Conv1D(
+ in_channels, residual_channels, 1, bias_attr=True)
+ self.first_act = nn.ReLU()
+
+ self.conv_layers = nn.LayerList()
+ for layer in range(layers):
+ dilation = 2**(layer % layers_per_stack)
+ conv = WaveNetResidualBlock(
+ kernel_size=kernel_size,
+ residual_channels=residual_channels,
+ gate_channels=gate_channels,
+ skip_channels=skip_channels,
+ aux_channels=aux_channels,
+ dilation=dilation,
+ dropout=dropout,
+ bias=bias)
+ self.conv_layers.append(conv)
+
+ final_conv = nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True)
+ nn.initializer.Constant(0.0)(final_conv.weight)
+ self.last_conv_layers = nn.Sequential(nn.ReLU(),
+ nn.Conv1D(
+ skip_channels,
+ skip_channels,
+ 1,
+ bias_attr=True),
+ nn.ReLU(), final_conv)
+
+ if use_weight_norm:
+ self.apply_weight_norm()
+
+ def forward(self, x: paddle.Tensor, t: paddle.Tensor, c: paddle.Tensor):
+ """Denoise mel-spectrogram.
+
+ Args:
+ x(Tensor):
+ Shape (B, C_in, T), The input mel-spectrogram.
+ t(Tensor):
+ Shape (B), The timestep input.
+ c(Tensor):
+ Shape (B, C_aux, T'). The auxiliary input (e.g. fastspeech2 encoder output).
+
+ Returns:
+            Tensor: Shape (B, C_out, T), the predicted noise.
+ """
+ assert c.shape[-1] == x.shape[-1]
+
+ if t.shape[0] != x.shape[0]:
+ t = t.tile([x.shape[0]])
+ t_emb = self.first_t_emb(t)
+ t_embs = [
+ t_emb_layer(t_emb)[..., None] for t_emb_layer in self.t_emb_layers
+ ]
+
+ x = self.first_conv(x)
+ x = self.first_act(x)
+ skips = 0
+ for f, t in zip(self.conv_layers, t_embs):
+ x = x + t
+ x, s = f(x, c)
+ skips += s
+ skips *= math.sqrt(1.0 / len(self.conv_layers))
+
+ x = self.last_conv_layers(skips)
+ return x
+
+ def apply_weight_norm(self):
+ """Recursively apply weight normalization to all the Convolution layers
+ in the sublayers.
+ """
+
+ def _apply_weight_norm(layer):
+ if isinstance(layer, (nn.Conv1D, nn.Conv2D)):
+ nn.utils.weight_norm(layer)
+
+ self.apply(_apply_weight_norm)
+
+ def remove_weight_norm(self):
+ """Recursively remove weight normalization from all the Convolution
+ layers in the sublayers.
+ """
+
+ def _remove_weight_norm(layer):
+ try:
+ nn.utils.remove_weight_norm(layer)
+ except ValueError:
+ pass
+
+ self.apply(_remove_weight_norm)
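
For readers skimming this hunk, a minimal smoke test of the denoiser added above could look like the sketch below. The import path is an assumption (it simply mirrors the residual-block import at the top of the file), and the tensor shapes follow the `forward()` docstring: x is (B, C_in, T), t is (B,), c is (B, C_aux, T).

```python
import paddle

# Assumed import path; adjust to wherever WaveNetDenoiser actually lives in the tree.
from paddlespeech.t2s.modules.wavenet_denoiser import WaveNetDenoiser

denoiser = WaveNetDenoiser(in_channels=80, out_channels=80, aux_channels=256)

x = paddle.randn([2, 80, 100])    # noisy mel-spectrogram, (B, C_in, T)
t = paddle.randint(0, 1000, [2])  # diffusion timesteps, (B,)
c = paddle.randn([2, 256, 100])   # conditioning features, (B, C_aux, T), e.g. FastSpeech2 encoder output

noise_pred = denoiser(x, t, c)    # predicted noise, (B, C_out, T)
print(noise_pred.shape)           # expected: [2, 80, 100]
```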
diff --git a/paddlespeech/t2s/training/trainer.py b/paddlespeech/t2s/training/trainer.py
index 9a32bca8..7631ef35 100644
--- a/paddlespeech/t2s/training/trainer.py
+++ b/paddlespeech/t2s/training/trainer.py
@@ -20,7 +20,7 @@ from typing import List
from typing import Union
import six
-
+import paddle
from paddlespeech.t2s.training.extension import Extension
from paddlespeech.t2s.training.extension import PRIORITY_READER
from paddlespeech.t2s.training.reporter import scope
@@ -162,8 +162,13 @@ class Trainer(object):
) + "avg_batch_cost: {:.5f} sec, ".format(avg_batch_cost)
msg += "avg_samples: {}, ".format(
self.updater.
- batch_size) + "avg_ips: {:.5f} sequences/sec".format(
+ batch_size) + "avg_ips: {:.5f} sequences/sec,".format(
self.updater.batch_size / avg_batch_cost)
+ if paddle.device.is_compiled_with_cuda():
+ max_mem_reserved_str = f" max_mem_reserved: {paddle.device.cuda.max_memory_reserved() // (1024 ** 2)} MB"
+ max_mem_allocated_str = f" max_mem_allocated: {paddle.device.cuda.max_memory_allocated() // (1024 ** 2)} MB"
+ msg += max_mem_reserved_str + "," + max_mem_allocated_str
+
logger.info(msg)
# execute extension when necessary
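
As a side note on the two CUDA counters the log line above now reports: max_memory_allocated is the peak number of bytes actually handed out to tensors, while max_memory_reserved is the peak held by the allocator's pools. A tiny standalone check (values are illustrative) could be:

```python
import paddle

if paddle.device.is_compiled_with_cuda():
    x = paddle.randn([1024, 1024])
    y = x @ x  # do some work so the allocator has something to report
    print(paddle.device.cuda.max_memory_allocated() // (1024 ** 2), "MB allocated (peak, tensors)")
    print(paddle.device.cuda.max_memory_reserved() // (1024 ** 2), "MB reserved (peak, allocator pools)")
```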
diff --git a/paddlespeech/t2s/utils/profiler.py b/paddlespeech/t2s/utils/profiler.py
index 2bbeb02d..629ef4ef 100644
--- a/paddlespeech/t2s/utils/profiler.py
+++ b/paddlespeech/t2s/utils/profiler.py
@@ -11,9 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import sys
-import paddle
+import sys
+import paddle.profiler as profiler
# A global variable to record the number of calling times for profiler
# functions. It is used to specify the tracing range of training steps.
@@ -21,7 +21,7 @@ _profiler_step_id = 0
# A global variable to avoid parsing from string every time.
_profiler_options = None
-
+_prof = None
class ProfilerOptions(object):
'''
@@ -31,6 +31,7 @@ class ProfilerOptions(object):
"profile_path=model.profile"
"batch_range=[50, 60]; profile_path=model.profile"
"batch_range=[50, 60]; tracer_option=OpDetail; profile_path=model.profile"
+
ProfilerOptions supports following key-value pair:
batch_range - an integer list, e.g. [100, 110].
state - a string, the optional values are 'CPU', 'GPU' or 'All'.
@@ -52,7 +53,8 @@ class ProfilerOptions(object):
'sorted_key': 'total',
'tracer_option': 'Default',
'profile_path': '/tmp/profile',
- 'exit_on_finished': True
+ 'exit_on_finished': True,
+ 'timer_only': True
}
self._parse_from_string(options_str)
@@ -71,6 +73,8 @@ class ProfilerOptions(object):
'state', 'sorted_key', 'tracer_option', 'profile_path'
]:
self._options[key] = value
+ elif key == 'timer_only':
+ self._options[key] = value
def __getitem__(self, name):
if self._options.get(name, None) is None:
@@ -84,7 +88,6 @@ def add_profiler_step(options_str=None):
Enable the operator-level timing using PaddlePaddle's profiler.
The profiler uses an independent variable to count the profiler steps.
One call of this function is treated as a profiler step.
-
Args:
profiler_options - a string to initialize the ProfilerOptions.
Default is None, and the profiler is disabled.
@@ -92,18 +95,33 @@ def add_profiler_step(options_str=None):
if options_str is None:
return
+ global _prof
global _profiler_step_id
global _profiler_options
if _profiler_options is None:
_profiler_options = ProfilerOptions(options_str)
-
- if _profiler_step_id == _profiler_options['batch_range'][0]:
- paddle.utils.profiler.start_profiler(_profiler_options['state'],
- _profiler_options['tracer_option'])
- elif _profiler_step_id == _profiler_options['batch_range'][1]:
- paddle.utils.profiler.stop_profiler(_profiler_options['sorted_key'],
- _profiler_options['profile_path'])
+ # profiling guide: https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/performance_improving/profiling_model.html#chakanxingnengshujudetongjibiaodan
+ # timer_only = True: only the model's throughput and time overhead are reported.
+ # timer_only = False: calling summary() additionally prints statistical tables that present performance data from different perspectives,
+ # and the exported Timeline can be found in the profiler_log directory.
+ if _prof is None:
+ _timer_only = str(_profiler_options['timer_only']) == str(True)
+ _prof = profiler.Profiler(
+ scheduler = (_profiler_options['batch_range'][0], _profiler_options['batch_range'][1]),
+ on_trace_ready = profiler.export_chrome_tracing('./profiler_log'),
+ timer_only = _timer_only)
+ _prof.start()
+ else:
+ _prof.step()
+
+ if _profiler_step_id == _profiler_options['batch_range'][1]:
+ _prof.stop()
+ _prof.summary(
+ op_detail=True,
+ thread_sep=False,
+ time_unit='ms')
+ _prof = None
if _profiler_options['exit_on_finished']:
sys.exit(0)
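
To make the rewritten profiler hook easier to follow, here is a rough sketch of how a training loop might drive it; the option string follows the examples in the ProfilerOptions docstring, and `dataloader`/`train_step` are placeholders rather than names from this patch.

```python
from paddlespeech.t2s.utils.profiler import add_profiler_step

# Collect a full timeline (timer_only=False) for training steps 50-60; with the
# default exit_on_finished=True the process exits once step 60 is reached.
options = "batch_range=[50, 60]; timer_only=False"

for step, batch in enumerate(dataloader):  # `dataloader` is a placeholder
    add_profiler_step(options)             # one call == one profiler step
    loss = train_step(batch)               # `train_step` is a placeholder
```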
diff --git a/paddlespeech/text/exps/ernie_linear/train.py b/paddlespeech/text/exps/ernie_linear/train.py
index 22c25e17..f6eab6fe 100644
--- a/paddlespeech/text/exps/ernie_linear/train.py
+++ b/paddlespeech/text/exps/ernie_linear/train.py
@@ -66,7 +66,7 @@ def train_sp(args, config):
seed_everything(config.seed)
print(
- f"rank: {dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}",
+ f"rank:{dist.get_rank()}, pid: {os.getpid()}, parent_pid: {os.getppid()}"
)
# dataloader has been too verbose
logging.getLogger("DataLoader").disabled = True
diff --git a/paddlespeech/utils/argparse.py b/paddlespeech/utils/argparse.py
new file mode 100644
index 00000000..aad3801e
--- /dev/null
+++ b/paddlespeech/utils/argparse.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import hashlib
+import os
+import sys
+from typing import Text
+
+import distutils.util
+
+__all__ = ["print_arguments", "add_arguments", "get_commandline_args"]
+
+
+def get_commandline_args():
+ extra_chars = [
+ " ",
+ ";",
+ "&",
+ "(",
+ ")",
+ "|",
+ "^",
+ "<",
+ ">",
+ "?",
+ "*",
+ "[",
+ "]",
+ "$",
+ "`",
+ '"',
+ "\\",
+ "!",
+ "{",
+ "}",
+ ]
+
+ # Escape the extra characters for shell
+ argv = [
+ arg.replace("'", "'\\''") if all(char not in arg
+ for char in extra_chars) else
+ "'" + arg.replace("'", "'\\''") + "'" for arg in sys.argv
+ ]
+
+ return sys.executable + " " + " ".join(argv)
+
+
+def print_arguments(args, info=None):
+ """Print argparse's arguments.
+
+ Usage:
+
+ .. code-block:: python
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("name", default="Jonh", type=str, help="User name.")
+ args = parser.parse_args()
+ print_arguments(args)
+
+ :param args: Input argparse.Namespace for printing.
+ :type args: argparse.Namespace
+ """
+ filename = ""
+ if info:
+ filename = info["__file__"]
+ filename = os.path.basename(filename)
+ print(f"----------- {filename} Configuration Arguments -----------")
+ for arg, value in sorted(vars(args).items()):
+ print("%s: %s" % (arg, value))
+ print("-----------------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+ """Add argparse's argument.
+
+ Usage:
+
+ .. code-block:: python
+
+ parser = argparse.ArgumentParser()
+ add_argument("name", str, "Jonh", "User name.", parser)
+ args = parser.parse_args()
+ """
+ type = distutils.util.strtobool if type == bool else type
+ argparser.add_argument(
+ "--" + argname,
+ default=default,
+ type=type,
+ help=help + ' Default: %(default)s.',
+ **kwargs)
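
A short, hypothetical end-to-end use of the three helpers defined in this new module, wired up with `functools.partial` in the same way other scripts in the repo do:

```python
import argparse
import functools

from paddlespeech.utils.argparse import add_arguments, print_arguments, get_commandline_args

parser = argparse.ArgumentParser(description="toy example")
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('unit_type', str, 'char', "Unit type, e.g. char, word, spm.")
add_arg('use_gpu', bool, True, "Whether to run on GPU.")  # bool values go through strtobool

args = parser.parse_args()
print_arguments(args)          # pretty-prints the parsed namespace
print(get_commandline_args())  # shell-escaped copy of the current command line
```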
diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py
index bf014045..2dc7a716 100644
--- a/paddlespeech/vector/exps/ecapa_tdnn/train.py
+++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py
@@ -51,7 +51,7 @@ def main(args, config):
# stage0: set the training device, cpu or gpu
paddle.set_device(args.device)
- # stage1: we must call the paddle.distributed.init_parallel_env() api at the begining
+ # stage1: we must call the paddle.distributed.init_parallel_env() api at the beginning
paddle.distributed.init_parallel_env()
nranks = paddle.distributed.get_world_size()
rank = paddle.distributed.get_rank()
@@ -146,7 +146,7 @@ def main(args, config):
timer.start()
for epoch in range(start_epoch + 1, config.epochs + 1):
- # at the begining, model must set to train mode
+ # at the beginning, model must set to train mode
model.train()
avg_loss = 0
diff --git a/paddlespeech/vector/exps/ge2e/preprocess.py b/paddlespeech/vector/exps/ge2e/preprocess.py
index dabe0ce7..ee59e624 100644
--- a/paddlespeech/vector/exps/ge2e/preprocess.py
+++ b/paddlespeech/vector/exps/ge2e/preprocess.py
@@ -42,7 +42,7 @@ if __name__ == "__main__":
parser.add_argument(
"--skip_existing",
action="store_true",
- help="Whether to skip ouput files with the same name. Useful if this script was interrupted."
+ help="Whether to skip output files with the same name. Useful if this script was interrupted."
)
parser.add_argument(
"--no_trim",
diff --git a/paddlespeech/vector/io/augment.py b/paddlespeech/vector/io/augment.py
index 0aa89c6a..ffda1386 100644
--- a/paddlespeech/vector/io/augment.py
+++ b/paddlespeech/vector/io/augment.py
@@ -343,7 +343,8 @@ class Resample(nn.Layer):
window_width = self.lowpass_filter_width / (2.0 * lowpass_cutoff)
assert lowpass_cutoff < min(self.orig_freq, self.new_freq) / 2
- output_t = paddle.arange(start=0.0, end=self.output_samples)
+ output_t = paddle.arange(
+ start=0.0, end=self.output_samples, dtype='int64')
output_t /= self.new_freq
min_t = output_t - window_width
max_t = output_t + window_width
diff --git a/setup.py b/setup.py
index 578f4986..7d3af40d 100644
--- a/setup.py
+++ b/setup.py
@@ -40,16 +40,20 @@ base = [
"hyperpyyaml",
"inflect",
"jsonlines",
+ # paddleaudio aligns with librosa==0.8.1, which needs numpy==1.23.x
+ "numpy==1.23.5",
"librosa==0.8.1",
+ "scipy>=1.4.0",
"loguru",
"matplotlib",
"nara_wpe",
"onnxruntime>=1.11.0",
- "opencc",
+ "opencc==1.1.6",
"opencc-python-reimplemented",
"pandas",
"paddleaudio>=1.1.0",
"paddlenlp>=2.4.8",
+ "paddlepaddle-gpu==2.5.1",
"paddleslim>=2.3.4",
"ppdiffusers>=0.9.0",
"paddlespeech_feat",
@@ -259,6 +263,7 @@ setup_info = dict(
long_description=read("README.md"),
long_description_content_type="text/markdown",
keywords=[
+ "SSL"
"speech",
"asr",
"tts",
@@ -267,12 +272,19 @@ setup_info = dict(
"text frontend",
"MFA",
"paddlepaddle",
+ "paddleaudio",
+ "streaming asr",
+ "streaming tts",
"beam search",
"ctcdecoder",
"deepspeech2",
+ "wav2vec2",
+ "hubert",
+ "wavlm",
"transformer",
"conformer",
"fastspeech2",
+ "hifigan",
"gan vocoders",
],
python_requires='>=3.7',
diff --git a/speechx/README.md b/speechx/README.md
index 5d4b5845..66227c68 100644
--- a/speechx/README.md
+++ b/speechx/README.md
@@ -3,7 +3,7 @@
## Environment
We develop under:
-* python - 3.7
+* python - >=3.8
* docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2-gpu-cuda10.2-cudnn7`
* os - Ubuntu 16.04.7 LTS
* gcc/g++/gfortran - 8.2.0
@@ -99,7 +99,7 @@ please install paddlepaddle >= 2.4rc
```
-cd $YOUR_ENV_PATH/lib/python3.7/site-packages/paddle/fluid
+cd $YOUR_ENV_PATH/lib/python3.8/site-packages/paddle/fluid
patchelf --set-soname libpaddle.so libpaddle.so
```
diff --git a/speechx/examples/ds2_ol/onnx/local/onnx_infer_shape.py b/speechx/examples/ds2_ol/onnx/local/onnx_infer_shape.py
index c53e9ec9..65709fc2 100755
--- a/speechx/examples/ds2_ol/onnx/local/onnx_infer_shape.py
+++ b/speechx/examples/ds2_ol/onnx/local/onnx_infer_shape.py
@@ -2078,7 +2078,7 @@ class SymbolicShapeInference:
output_tensor_ranks = get_attribute(node, 'output_tensor_ranks')
assert output_tensor_ranks
- # set the context output seperately.
+ # set the context output separately.
# The first output is autograd's context.
vi = self.known_vi_[node.output[0]]
vi.CopyFrom(
diff --git a/speechx/speechx/frontend/audio/db_norm.cc b/speechx/speechx/frontend/audio/db_norm.cc
index ad79fcc3..7141fc80 100644
--- a/speechx/speechx/frontend/audio/db_norm.cc
+++ b/speechx/speechx/frontend/audio/db_norm.cc
@@ -76,7 +76,7 @@ bool DecibelNormalizer::Compute(VectorBase* waves) const {
if (gain > opts_.max_gain_db) {
LOG(ERROR)
<< "Unable to normalize segment to " << opts_.target_db << "dB,"
- << "because the the probable gain have exceeds opts_.max_gain_db"
+ << "because the probable gain has exceeded opts_.max_gain_db"
<< opts_.max_gain_db << "dB.";
return false;
}
diff --git a/speechx/speechx/kaldi/base/kaldi-types.h b/speechx/speechx/kaldi/base/kaldi-types.h
index c6a3e1ae..07381cf2 100644
--- a/speechx/speechx/kaldi/base/kaldi-types.h
+++ b/speechx/speechx/kaldi/base/kaldi-types.h
@@ -40,7 +40,7 @@ typedef float BaseFloat;
#include
// for discussion on what to do if you need compile kaldi
-// without OpenFST, see the bottom of this this file
+// without OpenFST, see the bottom of this file
#ifndef COMPILE_WITHOUT_OPENFST
diff --git a/speechx/speechx/kaldi/feat/pitch-functions.cc b/speechx/speechx/kaldi/feat/pitch-functions.cc
index 430e9bdb..d71169ec 100644
--- a/speechx/speechx/kaldi/feat/pitch-functions.cc
+++ b/speechx/speechx/kaldi/feat/pitch-functions.cc
@@ -746,7 +746,7 @@ OnlinePitchFeatureImpl::OnlinePitchFeatureImpl(
Vector lags_offset(lags_);
// lags_offset equals lags_ (which are the log-spaced lag values we want to
// measure the NCCF at) with nccf_first_lag_ / opts.resample_freq subtracted
- // from each element, so we can treat the measured NCCF values as as starting
+ // from each element, so we can treat the measured NCCF values as starting
// from sample zero in a signal that starts at the point start /
// opts.resample_freq. This is necessary because the ArbitraryResample code
// assumes that the input signal starts from sample zero.
diff --git a/speechx/speechx/kaldi/lat/lattice-functions.h b/speechx/speechx/kaldi/lat/lattice-functions.h
index 6b1b6656..785d3f96 100644
--- a/speechx/speechx/kaldi/lat/lattice-functions.h
+++ b/speechx/speechx/kaldi/lat/lattice-functions.h
@@ -355,12 +355,12 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
//
//
// /// This function returns the number of words in the longest sentence in a
-// /// CompactLattice (i.e. the the maximum of any path, of the count of
+// /// CompactLattice (i.e. the maximum of any path, of the count of
// /// olabels on that path).
// int32 LongestSentenceLength(const Lattice &lat);
//
// /// This function returns the number of words in the longest sentence in a
-// /// CompactLattice, i.e. the the maximum of any path, of the count of
+// /// CompactLattice, i.e. the maximum of any path, of the count of
// /// labels on that path... note, in CompactLattice, the ilabels and olabels
// /// are identical because it is an acceptor.
// int32 LongestSentenceLength(const CompactLattice &lat);
@@ -408,7 +408,7 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
//
// /// This function computes the mapping from the pair
// /// (frame-index, transition-id) to the pair
-// /// (sum-of-acoustic-scores, num-of-occurences) over all occurences of the
+// /// (sum-of-acoustic-scores, num-of-occurrences) over all occurrences of the
// /// transition-id in that frame.
// /// frame-index in the lattice.
// /// This function is useful for retaining the acoustic scores in a
@@ -422,13 +422,13 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
// /// @param [out] acoustic_scores
// /// Pointer to a map from the pair (frame-index,
// /// transition-id) to a pair (sum-of-acoustic-scores,
-// /// num-of-occurences).
+// /// num-of-occurrences).
// /// Usually the acoustic scores for a pdf-id (and hence
// /// transition-id) on a frame will be the same for all the
-// /// occurences of the pdf-id in that frame.
+// /// occurrences of the pdf-id in that frame.
// /// But if not, we will take the average of the acoustic
// /// scores. Hence, we store both the sum-of-acoustic-scores
-// /// and the num-of-occurences of the transition-id in that
+// /// and the num-of-occurrences of the transition-id in that
// /// frame.
// void ComputeAcousticScoresMap(
// const Lattice &lat,
@@ -440,8 +440,8 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
// ///
// /// @param [in] acoustic_scores
// /// A map from the pair (frame-index, transition-id) to a
-// /// pair (sum-of-acoustic-scores, num-of-occurences) of
-// /// the occurences of the transition-id in that frame.
+// /// pair (sum-of-acoustic-scores, num-of-occurrences) of
+// /// the occurrences of the transition-id in that frame.
// /// See the comments for ComputeAcousticScoresMap for
// /// details.
// /// @param [out] lat Pointer to the output lattice.
diff --git a/speechx/speechx/kaldi/matrix/kaldi-matrix.cc b/speechx/speechx/kaldi/matrix/kaldi-matrix.cc
index faf23cdf..85e6fecc 100644
--- a/speechx/speechx/kaldi/matrix/kaldi-matrix.cc
+++ b/speechx/speechx/kaldi/matrix/kaldi-matrix.cc
@@ -1646,7 +1646,7 @@ SubMatrix::SubMatrix(const MatrixBase &M,
static_cast(M.num_rows_ - ro) &&
static_cast(c) <=
static_cast(M.num_cols_ - co));
- // point to the begining of window
+ // point to the beginning of window
MatrixBase::num_rows_ = r;
MatrixBase::num_cols_ = c;
MatrixBase::stride_ = M.Stride();
diff --git a/speechx/speechx/kaldi/matrix/sparse-matrix.cc b/speechx/speechx/kaldi/matrix/sparse-matrix.cc
index 68a61e17..192d2584 100644
--- a/speechx/speechx/kaldi/matrix/sparse-matrix.cc
+++ b/speechx/speechx/kaldi/matrix/sparse-matrix.cc
@@ -998,7 +998,7 @@ void FilterCompressedMatrixRows(const CompressedMatrix &in,
// iterating row-wise versus column-wise in compressed-matrix uncompression.
if (num_kept_rows > heuristic * in.NumRows()) {
- // if quite a few of the the rows are kept, it may be more efficient
+ // if quite a few of the rows are kept, it may be more efficient
// to uncompress the entire compressed matrix, since per-column operation
// is more efficient.
Matrix full_mat(in);
diff --git a/speechx/speechx/kaldi/util/kaldi-table-inl.h b/speechx/speechx/kaldi/util/kaldi-table-inl.h
index 6aca2f13..175e2704 100644
--- a/speechx/speechx/kaldi/util/kaldi-table-inl.h
+++ b/speechx/speechx/kaldi/util/kaldi-table-inl.h
@@ -1587,7 +1587,7 @@ template class RandomAccessTableReaderImplBase {
// this from a pipe. In principle we could read it on-demand as for the
// archives, but this would probably be overkill.
-// Note: the code for this this class is similar to TableWriterScriptImpl:
+// Note: the code for this class is similar to TableWriterScriptImpl:
// try to keep them in sync.
template
class RandomAccessTableReaderScriptImpl:
diff --git a/speechx/speechx/nnet/ds2_nnet.cc b/speechx/speechx/nnet/ds2_nnet.cc
index 22c7f61b..f30d7979 100644
--- a/speechx/speechx/nnet/ds2_nnet.cc
+++ b/speechx/speechx/nnet/ds2_nnet.cc
@@ -105,7 +105,7 @@ paddle_infer::Predictor* PaddleNnet::GetPredictor() {
while (pred_id < pool_usages.size()) {
if (pool_usages[pred_id] == false) {
- predictor = pool->Retrive(pred_id);
+ predictor = pool->Retrieve(pred_id);
break;
}
++pred_id;
diff --git a/speechx/speechx/protocol/websocket/websocket_server.cc b/speechx/speechx/protocol/websocket/websocket_server.cc
index 14f2f6e9..d1bed1ca 100644
--- a/speechx/speechx/protocol/websocket/websocket_server.cc
+++ b/speechx/speechx/protocol/websocket/websocket_server.cc
@@ -32,14 +32,14 @@ void ConnectionHandler::OnSpeechStart() {
decode_thread_ = std::make_shared(
&ConnectionHandler::DecodeThreadFunc, this);
got_start_tag_ = true;
- LOG(INFO) << "Server: Recieved speech start signal, start reading speech";
+ LOG(INFO) << "Server: Received speech start signal, start reading speech";
json::value rv = {{"status", "ok"}, {"type", "server_ready"}};
ws_.text(true);
ws_.write(asio::buffer(json::serialize(rv)));
}
void ConnectionHandler::OnSpeechEnd() {
- LOG(INFO) << "Server: Recieved speech end signal";
+ LOG(INFO) << "Server: Received speech end signal";
if (recognizer_ != nullptr) {
recognizer_->SetFinished();
}
@@ -70,8 +70,8 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) {
pcm_data(i) = static_cast(*pdata);
pdata++;
}
- VLOG(2) << "Server: Recieved " << num_samples << " samples";
- LOG(INFO) << "Server: Recieved " << num_samples << " samples";
+ VLOG(2) << "Server: Received " << num_samples << " samples";
+ LOG(INFO) << "Server: Received " << num_samples << " samples";
CHECK(recognizer_ != nullptr);
recognizer_->Accept(pcm_data);
diff --git a/speechx/tools/venv.sh b/speechx/tools/venv.sh
index 3952988c..2aa7e509 100755
--- a/speechx/tools/venv.sh
+++ b/speechx/tools/venv.sh
@@ -1,5 +1,5 @@
#!/bin/bash
set -ex
-PYTHON=python3.7
+PYTHON=python3.8
test -d venv || virtualenv -p ${PYTHON} venv
diff --git a/tests/test_tipc/benchmark_train.sh b/tests/test_tipc/benchmark_train.sh
index 7f0382ac..b53e1f16 100644
--- a/tests/test_tipc/benchmark_train.sh
+++ b/tests/test_tipc/benchmark_train.sh
@@ -110,6 +110,8 @@ repo_name=$(get_repo_name )
SAVE_LOG=${BENCHMARK_LOG_DIR:-$(pwd)} # */benchmark_log
mkdir -p "${SAVE_LOG}/benchmark_log/"
status_log="${SAVE_LOG}/benchmark_log/results.log"
+# get benchmark profiling params : PROFILING_TIMER_ONLY=no|True|False
+PROFILING_TIMER_ONLY=${PROFILING_TIMER_ONLY:-"True"}
# The number of lines in which train params can be replaced.
line_python=3
@@ -166,19 +168,25 @@ for batch_size in ${batch_size_list[*]}; do
gpu_id=$(set_gpu_id $device_num)
if [ ${#gpu_id} -le 1 ];then
- log_path="$SAVE_LOG/profiling_log"
- mkdir -p $log_path
- log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_profiling"
func_sed_params "$FILENAME" "${line_gpuid}" "0" # sed used gpu_id
- # set profile_option params
- tmp=`sed -i "${line_profile}s/.*/${profile_option}/" "${FILENAME}"`
-
- # run test_train_inference_python.sh
- cmd="bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
- echo $cmd
- eval $cmd
- eval "cat ${log_path}/${log_name}"
-
+ if [[ ${PROFILING_TIMER_ONLY} != "no" ]];then
+ echo "run profile"
+ # The default value of profile_option's timer_only parameter is True
+ if [[ ${PROFILING_TIMER_ONLY} = "False" ]];then
+ profile_option="${profile_option};timer_only=False"
+ fi
+ log_path="$SAVE_LOG/profiling_log"
+ mkdir -p $log_path
+ log_name="${repo_name}_${model_name}_bs${batch_size}_${precision}_${run_mode}_${device_num}_${to_static}profiling"
+ # set profile_option params
+ tmp=`sed -i "${line_profile}s/.*/\"${profile_option}\"/" "${FILENAME}"`
+ # run test_train_inference_python.sh
+ cmd="timeout 5m bash test_tipc/test_train_inference_python.sh ${FILENAME} benchmark_train > ${log_path}/${log_name} 2>&1 "
+ echo $cmd
+ eval ${cmd}
+ eval "cat ${log_path}/${log_name}"
+ fi
+ echo "run without profile"
# without profile
log_path="$SAVE_LOG/train_log"
speed_log_path="$SAVE_LOG/index"
diff --git a/tests/test_tipc/conformer/scripts/aishell_tiny.py b/tests/test_tipc/conformer/scripts/aishell_tiny.py
index 14f09f17..c87463b5 100644
--- a/tests/test_tipc/conformer/scripts/aishell_tiny.py
+++ b/tests/test_tipc/conformer/scripts/aishell_tiny.py
@@ -26,8 +26,8 @@ from pathlib import Path
import soundfile
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh
index 9ff81bd8..e57feda0 100755
--- a/tests/test_tipc/prepare.sh
+++ b/tests/test_tipc/prepare.sh
@@ -29,11 +29,11 @@ if [[ ${MODE} = "benchmark_train" ]];then
cd ${curPath}/../..
echo "------------- install for speech "
apt-get install libsndfile1 -y
- pip install yacs -i https://pypi.tuna.tsinghua.edu.cn/simple
- pip install pytest-runner -i https://pypi.tuna.tsinghua.edu.cn/simple
- pip install kaldiio -i https://pypi.tuna.tsinghua.edu.cn/simple
- pip install setuptools_scm -i https://pypi.tuna.tsinghua.edu.cn/simple
- pip install . -i https://pypi.tuna.tsinghua.edu.cn/simple
+ pip install yacs #-i https://pypi.tuna.tsinghua.edu.cn/simple
+ pip install pytest-runner #-i https://pypi.tuna.tsinghua.edu.cn/simple
+ pip install kaldiio #-i https://pypi.tuna.tsinghua.edu.cn/simple
+ pip install setuptools_scm #-i https://pypi.tuna.tsinghua.edu.cn/simple
+ pip install . #-i https://pypi.tuna.tsinghua.edu.cn/simple
pip install jsonlines
pip list
cd -
diff --git a/tests/unit/cli/aishell_test_prepare.py b/tests/unit/cli/aishell_test_prepare.py
index ed542c57..c364e4fd 100644
--- a/tests/unit/cli/aishell_test_prepare.py
+++ b/tests/unit/cli/aishell_test_prepare.py
@@ -25,8 +25,8 @@ from pathlib import Path
import soundfile
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index a7f7d11e..3903e659 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -10,11 +10,12 @@ paddlespeech cls --input ./cat.wav --topk 10
paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 --model ernie_linear_p3_wudao_fast
# Speech SSL
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
paddlespeech ssl --task asr --lang en --input ./en.wav
paddlespeech ssl --task vector --lang en --input ./en.wav
# Speech_recognition
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
paddlespeech asr --input ./zh.wav
paddlespeech asr --model conformer_aishell --input ./zh.wav
paddlespeech asr --model conformer_online_aishell --input ./zh.wav
@@ -110,5 +111,7 @@ paddlespeech whisper --task transcribe --input ./zh.wav
# whisper recognize text and translate to English
paddlespeech whisper --task translate --input ./zh.wav
+# switch to the English-only model
+paddlespeech whisper --lang en --size base --task transcribe --input ./en.wav
echo -e "\033[32mTest success !!!\033[0m"
diff --git a/tests/unit/doc/test_cli.md b/tests/unit/doc/test_cli.md
new file mode 100644
index 00000000..34a0c016
--- /dev/null
+++ b/tests/unit/doc/test_cli.md
@@ -0,0 +1,29 @@
+# test CLI documentation
+
+This document describes the CLI tests, which currently cover most of the CLI inference entry points in paddlespeech. Once this CI is in place, it is used to quickly verify that fixes are correct.
+
+# Test procedure
+## 1. Environment setup
+
+When the CI is rebuilt, it first runs against the last passing versions: paddlepaddle-gpu==2.5.1, paddlespeech==develop.
+
+After the rebuild, it runs against paddlepaddle-gpu==develop, paddlespeech==develop.
+
+### Other dependencies
+
+gcc >= 4.8.5,
+python >= 3.8
+
+## 2. Functional tests
+
+Run the following in tests/unit/cli of the repo:
+
+```shell
+
+source path.sh
+bash test_cli.sh
+
+```
+## 3. Expected result
+
+The run is considered successful if it prints "Test success" and no errors are reported during execution.
diff --git a/tests/unit/tts/test_enfrontend.py b/tests/unit/tts/test_enfrontend.py
new file mode 100644
index 00000000..4f8c4930
--- /dev/null
+++ b/tests/unit/tts/test_enfrontend.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddlespeech.t2s.frontend.en_frontend import English as EnFrontend
+
+if __name__ == '__main__':
+
+ fe = EnFrontend()
+
+ text = "AI for Sceience"
+ phonemes = fe.phoneticize(text)
+ print(text)
+ print(phonemes)
+
+ text = "eight"
+ phonemes = fe.phoneticize(text)
+ print(text)
+ print(phonemes)
diff --git a/tests/unit/tts/test_mixfrontend.py b/tests/unit/tts/test_mixfrontend.py
new file mode 100644
index 00000000..5751dd2a
--- /dev/null
+++ b/tests/unit/tts/test_mixfrontend.py
@@ -0,0 +1,444 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+import tempfile
+
+from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
+
+# mix zh & en phonemes
+phone_id_str = """
+ 0
+ 1
+AA0 2
+AA1 3
+AA2 4
+AE0 5
+AE1 6
+AE2 7
+AH0 8
+AH1 9
+AH2 10
+AO0 11
+AO1 12
+AO2 13
+AW0 14
+AW1 15
+AW2 16
+AY0 17
+AY1 18
+AY2 19
+B 20
+CH 21
+D 22
+DH 23
+EH0 24
+EH1 25
+EH2 26
+ER0 27
+ER1 28
+ER2 29
+EY0 30
+EY1 31
+EY2 32
+F 33
+G 34
+HH 35
+IH0 36
+IH1 37
+IH2 38
+IY0 39
+IY1 40
+IY2 41
+JH 42
+K 43
+L 44
+M 45
+N 46
+NG 47
+OW0 48
+OW1 49
+OW2 50
+OY0 51
+OY1 52
+OY2 53
+P 54
+R 55
+S 56
+SH 57
+T 58
+TH 59
+UH0 60
+UH1 61
+UH2 62
+UW0 63
+UW1 64
+UW2 65
+V 66
+W 67
+Y 68
+Z 69
+ZH 70
+a1 71
+a2 72
+a3 73
+a4 74
+a5 75
+ai1 76
+ai2 77
+ai3 78
+ai4 79
+ai5 80
+air2 81
+air3 82
+air4 83
+an1 84
+an2 85
+an3 86
+an4 87
+an5 88
+ang1 89
+ang2 90
+ang3 91
+ang4 92
+ang5 93
+angr2 94
+angr4 95
+anr1 96
+anr3 97
+anr4 98
+ao1 99
+ao2 100
+ao3 101
+ao4 102
+ao5 103
+aor1 104
+aor3 105
+aor4 106
+aor5 107
+ar2 108
+ar3 109
+ar4 110
+ar5 111
+b 112
+c 113
+ch 114
+d 115
+e1 116
+e2 117
+e3 118
+e4 119
+e5 120
+ei1 121
+ei2 122
+ei3 123
+ei4 124
+ei5 125
+eir4 126
+en1 127
+en2 128
+en3 129
+en4 130
+en5 131
+eng1 132
+eng2 133
+eng3 134
+eng4 135
+eng5 136
+engr4 137
+enr1 138
+enr2 139
+enr3 140
+enr4 141
+enr5 142
+er1 143
+er2 144
+er3 145
+er4 146
+er5 147
+f 148
+g 149
+h 150
+i1 151
+i2 152
+i3 153
+i4 154
+i5 155
+ia1 156
+ia2 157
+ia3 158
+ia4 159
+ia5 160
+ian1 161
+ian2 162
+ian3 163
+ian4 164
+ian5 165
+iang1 166
+iang2 167
+iang3 168
+iang4 169
+iang5 170
+iangr4 171
+ianr1 172
+ianr2 173
+ianr3 174
+ianr4 175
+ianr5 176
+iao1 177
+iao2 178
+iao3 179
+iao4 180
+iao5 181
+iaor1 182
+iaor2 183
+iaor3 184
+iaor4 185
+iar1 186
+iar3 187
+iar4 188
+ie1 189
+ie2 190
+ie3 191
+ie4 192
+ie5 193
+ii1 194
+ii2 195
+ii3 196
+ii4 197
+ii5 198
+iii1 199
+iii2 200
+iii3 201
+iii4 202
+iii5 203
+iiir1 204
+iiir4 205
+iir2 206
+in1 207
+in2 208
+in3 209
+in4 210
+in5 211
+ing1 212
+ing2 213
+ing3 214
+ing4 215
+ing5 216
+ingr1 217
+ingr2 218
+ingr3 219
+ingr4 220
+inr1 221
+inr4 222
+io1 223
+io3 224
+io5 225
+iong1 226
+iong2 227
+iong3 228
+iong4 229
+iong5 230
+iou1 231
+iou2 232
+iou3 233
+iou4 234
+iou5 235
+iour1 236
+iour2 237
+iour3 238
+iour4 239
+ir1 240
+ir2 241
+ir3 242
+ir4 243
+ir5 244
+j 245
+k 246
+l 247
+m 248
+n 249
+o1 250
+o2 251
+o3 252
+o4 253
+o5 254
+ong1 255
+ong2 256
+ong3 257
+ong4 258
+ong5 259
+ongr4 260
+or2 261
+ou1 262
+ou2 263
+ou3 264
+ou4 265
+ou5 266
+our2 267
+our3 268
+our4 269
+our5 270
+p 271
+q 272
+r 273
+s 274
+sh 275
+sil 276
+sp 277
+spl 278
+spn 279
+t 280
+u1 281
+u2 282
+u3 283
+u4 284
+u5 285
+ua1 286
+ua2 287
+ua3 288
+ua4 289
+ua5 290
+uai1 291
+uai2 292
+uai3 293
+uai4 294
+uai5 295
+uair4 296
+uan1 297
+uan2 298
+uan3 299
+uan4 300
+uan5 301
+uang1 302
+uang2 303
+uang3 304
+uang4 305
+uang5 306
+uangr4 307
+uanr1 308
+uanr2 309
+uanr3 310
+uanr4 311
+uanr5 312
+uar1 313
+uar2 314
+uar4 315
+uei1 316
+uei2 317
+uei3 318
+uei4 319
+uei5 320
+ueir1 321
+ueir2 322
+ueir3 323
+ueir4 324
+uen1 325
+uen2 326
+uen3 327
+uen4 328
+uen5 329
+ueng1 330
+ueng2 331
+ueng3 332
+ueng4 333
+uenr1 334
+uenr2 335
+uenr3 336
+uenr4 337
+uo1 338
+uo2 339
+uo3 340
+uo4 341
+uo5 342
+uor1 343
+uor2 344
+uor3 345
+uor5 346
+ur1 347
+ur2 348
+ur3 349
+ur4 350
+ur5 351
+v1 352
+v2 353
+v3 354
+v4 355
+v5 356
+van1 357
+van2 358
+van3 359
+van4 360
+van5 361
+vanr1 362
+vanr2 363
+vanr3 364
+vanr4 365
+ve1 366
+ve2 367
+ve3 368
+ve4 369
+ve5 370
+ver3 371
+ver4 372
+vn1 373
+vn2 374
+vn3 375
+vn4 376
+vn5 377
+vnr2 378
+vr3 379
+x 380
+z 381
+zh 382
+, 383
+. 384
+? 385
+! 386
+ 387
+"""
+
+if __name__ == '__main__':
+ with tempfile.NamedTemporaryFile(mode='wt') as f:
+ phone_ids = phone_id_str.split()
+ for phone, id in zip(phone_ids[::2], phone_ids[1::2]):
+ f.write(f"{phone} {id}")
+ f.write('\n')
+ f.flush()
+
+ frontend = MixFrontend(phone_vocab_path=f.name)
+
+ text = "hello, 我爱北京天安们,what about you."
+ print(text)
+ # [('hello, ', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')]
+ segs = frontend.split_by_lang(text)
+ print(segs)
+
+ text = "hello?!!我爱北京天安们,what about you."
+ print(text)
+ # [('hello?!!', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')]
+ segs = frontend.split_by_lang(text)
+ print(segs)
+
+ text = " hello,我爱北京天安们,what about you."
+ print(text)
+ # [(' hello,', 'en'), ('我爱北京天安们,', 'zh'), ('what about you.', 'en')]
+ segs = frontend.split_by_lang(text)
+ print(segs)
+
+ # The SSML/XML tags are not handled well here: the SSML should be parsed first, and the zh/en split applied afterwards.
+ text = "我们的声学模型使用了 Fast Speech Two。前浪倒 在沙滩上,沙滩上倒了一堆土 。 想象干干 的树干倒 了, 里面有个干尸,不知是被谁干 死的。 "
+ print(text)
+ # [('', 'en'), ('我们的声学模型使用了 ', 'zh'), ('Fast Speech Two。', 'en'), ('前浪<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒', 'zh'), ('say-as>', 'en'), ('在沙滩上,沙滩上倒了一堆<', 'zh'), ("say-as pinyin='tu3'>", 'en'), ('土', 'zh'), ('say-as>。 ', 'en'), ('想象<', 'zh'), ("say-as pinyin='gan1 gan1'>", 'en'), ('干干', 'zh'), ('say-as>', 'en'), ('的树干<', 'zh'), ("say-as pinyin='dao3'>", 'en'), ('倒', 'zh'), ('say-as>', 'en'), ('了, 里面有个干尸,不知是被谁<', 'zh'), ("say-as pinyin='gan4'>", 'en'), ('干', 'zh'), ('say-as>', 'en'), ('死的。', 'zh'), ('speak>', 'en')]
+ segs = frontend.split_by_lang(text)
+ print(segs)
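
Following the note above about parsing SSML before running the language split, a rough sketch of that ordering using the MixTextProcessor exercised in the neighbouring test_ssml.py (the concrete SSML string here is only illustrative):

```python
from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor

ssml_text = ("<speak>我们的声学模型使用了 Fast Speech Two。"
             "前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上</speak>")

# Strip/parse the SSML first, then run the zh/en split on the plain text.
plain_text = MixTextProcessor().get_xml_content(ssml_text)  # content without XML tags
segs = frontend.split_by_lang(plain_text)                   # `frontend` is the MixFrontend built above
print(segs)
```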
diff --git a/tests/unit/tts/test_ssml.py b/tests/unit/tts/test_ssml.py
new file mode 100644
index 00000000..4c3e9d53
--- /dev/null
+++ b/tests/unit/tts/test_ssml.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddlespeech.t2s.frontend.ssml.xml_processor import MixTextProcessor
+
+if __name__ == '__main__':
+ text = "你好吗,我们的声学模型使用了 Fast Speech Two。前浪倒 在沙滩上,沙滩上倒了一堆土 。 想象干干 的树干倒 了, 里面有个干尸,不知是被谁干 死的。 thank you."
+
+ # SSML: 13
+ # 0 ['你好吗,', []]
+ # 1 ['我们的声学模型使用了FastSpeechTwo。前浪', []]
+ # 2 ['倒', ['dao3']]
+ # 3 ['在沙滩上,沙滩上倒了一堆', []]
+ # 4 ['土', ['tu3']]
+ # 5 ['。想象', []]
+ # 6 ['干干', ['gan1', 'gan1']]
+ # 7 ['的树干', []]
+ # 8 ['倒', ['dao3']]
+ # 9 ['了,里面有个干尸,不知是被谁', []]
+ # 10 ['干', ['gan4']]
+ # 11 ['死的。', []]
+ # 12 ['thank you.', []]
+ inputs = MixTextProcessor.get_pinyin_split(text)
+ print(f"SSML get_pinyin_split: {len(inputs)}")
+ for i, sub in enumerate(inputs):
+ print(i, sub)
+ print()
+
+ # SSML get_dom_split: 13
+ # 0 你好吗,
+ # 1 我们的声学模型使用了 Fast Speech Two。前浪
+ # 2 倒
+ # 3 在沙滩上,沙滩上倒了一堆
+ # 4 土
+ # 5 。 想象
+ # 6 干干
+ # 7 的树干
+ # 8 倒
+ # 9 了, 里面有个干尸,不知是被谁
+ # 10 干
+ # 11 死的。
+ # 12 thank you.
+ inputs = MixTextProcessor.get_dom_split(text)
+ print(f"SSML get_dom_split: {len(inputs)}")
+ for i, sub in enumerate(inputs):
+ print(i, sub)
+ print()
+
+ # SSML object.get_xml_content: 246
+ # 我们的声学模型使用了 Fast Speech Two。前浪倒 在沙滩上,沙滩上倒了一堆土 。 想象干干 的树干倒 了, 里面有个干尸,不知是被谁干 死的。
+ outs = MixTextProcessor().get_xml_content(text)
+ print(f"SSML object.get_xml_content: {len(outs)}")
+ print(outs)
+ print()
+
+ # SSML object.get_content_split: 3
+ # 0 你好吗,
+ # 1 我们的声学模型使用了 Fast Speech Two。前浪倒 在沙滩上,沙滩上倒了一堆土 。 想象干干 的树干
+ # 倒 了, 里面有个干尸,不知是被谁干 死的。
+ # 2 thank you.
+ outs = MixTextProcessor().get_content_split(text)
+ print(f"SSML object.get_content_split: {len(outs)}")
+ for i, sub in enumerate(outs):
+ print(i, sub)
+ print()
+
+ import json
+ import xmltodict
+ text = "我们的声学模型使用了 Fast Speech Two。前浪倒 在沙滩上,沙滩上倒了一堆土 。 想象干干 的树干倒 了, 里面有个干尸,不知是被谁干 死的。 "
+ ssml = xmltodict.parse(text)
+ print(json.dumps(ssml))
+ print(ssml['speak'].keys())
+ print(ssml['speak']['#text'])
+ print(ssml['speak']['say-as'])
diff --git a/tools/Makefile b/tools/Makefile
index a5a4485d..c6c667cd 100644
--- a/tools/Makefile
+++ b/tools/Makefile
@@ -1,5 +1,5 @@
SHELL:= /bin/bash
-PYTHON:= python3.7
+PYTHON:= python3.8
CXX ?= g++
CC ?= gcc # used for sph2pipe
diff --git a/tools/extras/install_mkl.sh b/tools/extras/install_mkl.sh
index 8c1899bd..01bce64f 100755
--- a/tools/extras/install_mkl.sh
+++ b/tools/extras/install_mkl.sh
@@ -166,7 +166,7 @@ variable, sudo might not allow it to propagate to the command that it invokes."
fi
# The install variants, each in a function to simplify error reporting.
-# Each one invokes a subshell with a 'set -x' to to show system-modifying
+# Each one invokes a subshell with a 'set -x' to show system-modifying
# commands it runs. The subshells simply limit the scope of this diagnostics
# and avoid creating noise (if we were using 'set +x', it would be printed).
Install_redhat () {
diff --git a/utils/apply-cmvn.py b/utils/apply-cmvn.py
index cf91bdfc..fa69ff8e 100755
--- a/utils/apply-cmvn.py
+++ b/utils/apply-cmvn.py
@@ -6,7 +6,7 @@ import kaldiio
import numpy
from distutils.util import strtobool
-from paddlespeech.s2t.transform.cmvn import CMVN
+from paddlespeech.audio.transform.cmvn import CMVN
from paddlespeech.s2t.utils.cli_readers import file_reader_helper
from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style
diff --git a/utils/avg_model.py b/utils/avg_model.py
index 6ee16408..039ea626 100755
--- a/utils/avg_model.py
+++ b/utils/avg_model.py
@@ -12,105 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import argparse
-import glob
-import json
-import os
-
-import numpy as np
-import paddle
-
-
-def main(args):
- paddle.set_device('cpu')
-
- val_scores = []
- beat_val_scores = None
- selected_epochs = None
-
- jsons = glob.glob(f'{args.ckpt_dir}/[!train]*.json')
- jsons = sorted(jsons, key=os.path.getmtime, reverse=True)
- for y in jsons:
- with open(y, 'r') as f:
- dic_json = json.load(f)
- loss = dic_json['val_loss']
- epoch = dic_json['epoch']
- if epoch >= args.min_epoch and epoch <= args.max_epoch:
- val_scores.append((epoch, loss))
- val_scores = np.array(val_scores)
-
- if args.val_best:
- sort_idx = np.argsort(val_scores[:, 1])
- sorted_val_scores = val_scores[sort_idx]
- else:
- sorted_val_scores = val_scores
-
- beat_val_scores = sorted_val_scores[:args.num, 1]
- selected_epochs = sorted_val_scores[:args.num, 0].astype(np.int64)
- avg_val_score = np.mean(beat_val_scores)
- print("selected val scores = " + str(beat_val_scores))
- print("selected epochs = " + str(selected_epochs))
- print("averaged val score = " + str(avg_val_score))
-
- path_list = [
- args.ckpt_dir + '/{}.pdparams'.format(int(epoch))
- for epoch in sorted_val_scores[:args.num, 0]
- ]
- print(path_list)
-
- avg = None
- num = args.num
- assert num == len(path_list)
- for path in path_list:
- print(f'Processing {path}')
- states = paddle.load(path)
- if avg is None:
- avg = states
- else:
- for k in avg.keys():
- avg[k] += states[k]
- # average
- for k in avg.keys():
- if avg[k] is not None:
- avg[k] /= num
-
- paddle.save(avg, args.dst_model)
- print(f'Saving to {args.dst_model}')
-
- meta_path = os.path.splitext(args.dst_model)[0] + '.avg.json'
- with open(meta_path, 'w') as f:
- data = json.dumps({
- "mode": 'val_best' if args.val_best else 'latest',
- "avg_ckpt": args.dst_model,
- "val_loss_mean": avg_val_score,
- "ckpts": path_list,
- "epochs": selected_epochs.tolist(),
- "val_losses": beat_val_scores.tolist(),
- })
- f.write(data + "\n")
-
+from paddlespeech.dataset.s2t import avg_ckpts_main
if __name__ == '__main__':
- parser = argparse.ArgumentParser(description='average model')
- parser.add_argument('--dst_model', required=True, help='averaged model')
- parser.add_argument(
- '--ckpt_dir', required=True, help='ckpt model dir for average')
- parser.add_argument(
- '--val_best', action="store_true", help='averaged model')
- parser.add_argument(
- '--num', default=5, type=int, help='nums for averaged model')
- parser.add_argument(
- '--min_epoch',
- default=0,
- type=int,
- help='min epoch used for averaging model')
- parser.add_argument(
- '--max_epoch',
- default=65536, # Big enough
- type=int,
- help='max epoch used for averaging model')
-
- args = parser.parse_args()
- print(args)
-
- main(args)
+ avg_ckpts_main()
diff --git a/utils/build_vocab.py b/utils/build_vocab.py
index e364e821..9b29dfa5 100755
--- a/utils/build_vocab.py
+++ b/utils/build_vocab.py
@@ -15,134 +15,7 @@
"""Build vocabulary from manifest files.
Each item in vocabulary file is a character.
"""
-import argparse
-import functools
-import os
-import tempfile
-from collections import Counter
-
-import jsonlines
-
-from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.frontend.utility import BLANK
-from paddlespeech.s2t.frontend.utility import SOS
-from paddlespeech.s2t.frontend.utility import SPACE
-from paddlespeech.s2t.frontend.utility import UNK
-from paddlespeech.s2t.utils.utility import add_arguments
-from paddlespeech.s2t.utils.utility import print_arguments
-
-parser = argparse.ArgumentParser(description=__doc__)
-add_arg = functools.partial(add_arguments, argparser=parser)
-# yapf: disable
-add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
-add_arg('count_threshold', int, 0,
- "Truncation threshold for char/word counts.Default 0, no truncate.")
-add_arg('vocab_path', str,
- 'examples/librispeech/data/vocab.txt',
- "Filepath to write the vocabulary.")
-add_arg('manifest_paths', str,
- None,
- "Filepaths of manifests for building vocabulary. "
- "You can provide multiple manifest files.",
- nargs='+',
- required=True)
-add_arg('text_keys', str,
- 'text',
- "keys of the text in manifest for building vocabulary. "
- "You can provide multiple k.",
- nargs='+')
-# bpe
-add_arg('spm_vocab_size', int, 0, "Vocab size for spm.")
-add_arg('spm_mode', str, 'unigram', "spm model type, e.g. unigram, spm, char, word. only need when `unit_type` is spm")
-add_arg('spm_model_prefix', str, "", "spm_model_%(spm_mode)_%(count_threshold), spm model prefix, only need when `unit_type` is spm")
-add_arg('spm_character_coverage', float, 0.9995, "character coverage to determine the minimum symbols")
-
-# yapf: disable
-args = parser.parse_args()
-
-
-def count_manifest(counter, text_feature, manifest_path):
- manifest_jsons = []
- with jsonlines.open(manifest_path, 'r') as reader:
- for json_data in reader:
- manifest_jsons.append(json_data)
-
- for line_json in manifest_jsons:
- if isinstance(line_json['text'], str):
- line = text_feature.tokenize(line_json['text'], replace_space=False)
- counter.update(line)
- else:
- assert isinstance(line_json['text'], list)
- for text in line_json['text']:
- line = text_feature.tokenize(text, replace_space=False)
- counter.update(line)
-
-def dump_text_manifest(fileobj, manifest_path, key='text'):
- manifest_jsons = []
- with jsonlines.open(manifest_path, 'r') as reader:
- for json_data in reader:
- manifest_jsons.append(json_data)
-
- for line_json in manifest_jsons:
- if isinstance(line_json[key], str):
- fileobj.write(line_json[key] + "\n")
- else:
- assert isinstance(line_json[key], list)
- for line in line_json[key]:
- fileobj.write(line + "\n")
-
-def main():
- print_arguments(args, globals())
-
- fout = open(args.vocab_path, 'w', encoding='utf-8')
- fout.write(BLANK + "\n") # 0 will be used for "blank" in CTC
- fout.write(UNK + '\n') # must be 1
-
- if args.unit_type == 'spm':
- # tools/spm_train --input=$wave_data/lang_char/input.txt
- # --vocab_size=${nbpe} --model_type=${bpemode}
- # --model_prefix=${bpemodel} --input_sentence_size=100000000
- import sentencepiece as spm
-
- fp = tempfile.NamedTemporaryFile(mode='w', delete=False)
- for manifest_path in args.manifest_paths:
- text_keys = [args.text_keys] if type(args.text_keys) is not list else args.text_keys
- for text_key in text_keys:
- dump_text_manifest(fp, manifest_path, key=text_key)
- fp.close()
- # train
- spm.SentencePieceTrainer.Train(
- input=fp.name,
- vocab_size=args.spm_vocab_size,
- model_type=args.spm_mode,
- model_prefix=args.spm_model_prefix,
- input_sentence_size=100000000,
- character_coverage=args.spm_character_coverage)
- os.unlink(fp.name)
-
- # encode
- text_feature = TextFeaturizer(args.unit_type, "", args.spm_model_prefix)
- counter = Counter()
-
- for manifest_path in args.manifest_paths:
- count_manifest(counter, text_feature, manifest_path)
-
- count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
- tokens = []
- for token, count in count_sorted:
- if count < args.count_threshold:
- break
- # replace space by ``
- token = SPACE if token == ' ' else token
- tokens.append(token)
-
- tokens = sorted(tokens)
- for token in tokens:
- fout.write(token + '\n')
-
- fout.write(SOS + "\n") #
- fout.close()
-
+from paddlespeech.dataset.s2t import build_vocab_main
if __name__ == '__main__':
- main()
+ build_vocab_main()
diff --git a/utils/compute-cmvn-stats.py b/utils/compute-cmvn-stats.py
index 276bcd36..763347ce 100755
--- a/utils/compute-cmvn-stats.py
+++ b/utils/compute-cmvn-stats.py
@@ -5,7 +5,7 @@ import logging
import kaldiio
import numpy as np
-from paddlespeech.s2t.transform.transformation import Transformation
+from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.utils.cli_readers import file_reader_helper
from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style
diff --git a/utils/compute-wer.py b/utils/compute-wer.py
index 98bb24a7..1fa77216 100755
--- a/utils/compute-wer.py
+++ b/utils/compute-wer.py
@@ -1,554 +1,5 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
# Copyright 2021 Mobvoi Inc. All Rights Reserved.
-import codecs
-import re
-import sys
-import unicodedata
-
-remove_tag = True
-spacelist = [' ', '\t', '\r', '\n']
-puncts = [
- '!', ',', '?', '、', '。', '!', ',', ';', '?', ':', '「', '」', '︰', '『', '』',
- '《', '》'
-]
-
-
-def characterize(string):
- res = []
- i = 0
- while i < len(string):
- char = string[i]
- if char in puncts:
- i += 1
- continue
- cat1 = unicodedata.category(char)
- #https://unicodebook.readthedocs.io/unicode.html#unicode-categories
- if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned
- i += 1
- continue
- if cat1 == 'Lo': # letter-other
- res.append(char)
- i += 1
- else:
- # some input looks like: , we want to separate it to two words.
- sep = ' '
- if char == '<': sep = '>'
- j = i + 1
- while j < len(string):
- c = string[j]
- if ord(c) >= 128 or (c in spacelist) or (c == sep):
- break
- j += 1
- if j < len(string) and string[j] == '>':
- j += 1
- res.append(string[i:j])
- i = j
- return res
-
-
-def stripoff_tags(x):
- if not x: return ''
- chars = []
- i = 0
- T = len(x)
- while i < T:
- if x[i] == '<':
- while i < T and x[i] != '>':
- i += 1
- i += 1
- else:
- chars.append(x[i])
- i += 1
- return ''.join(chars)
-
-
-def normalize(sentence, ignore_words, cs, split=None):
- """ sentence, ignore_words are both in unicode
- """
- new_sentence = []
- for token in sentence:
- x = token
- if not cs:
- x = x.upper()
- if x in ignore_words:
- continue
- if remove_tag:
- x = stripoff_tags(x)
- if not x:
- continue
- if split and x in split:
- new_sentence += split[x]
- else:
- new_sentence.append(x)
- return new_sentence
-
-
-class Calculator:
- def __init__(self):
- self.data = {}
- self.space = []
- self.cost = {}
- self.cost['cor'] = 0
- self.cost['sub'] = 1
- self.cost['del'] = 1
- self.cost['ins'] = 1
-
- def calculate(self, lab, rec):
- # Initialization
- lab.insert(0, '')
- rec.insert(0, '')
- while len(self.space) < len(lab):
- self.space.append([])
- for row in self.space:
- for element in row:
- element['dist'] = 0
- element['error'] = 'non'
- while len(row) < len(rec):
- row.append({'dist': 0, 'error': 'non'})
- for i in range(len(lab)):
- self.space[i][0]['dist'] = i
- self.space[i][0]['error'] = 'del'
- for j in range(len(rec)):
- self.space[0][j]['dist'] = j
- self.space[0][j]['error'] = 'ins'
- self.space[0][0]['error'] = 'non'
- for token in lab:
- if token not in self.data and len(token) > 0:
- self.data[token] = {
- 'all': 0,
- 'cor': 0,
- 'sub': 0,
- 'ins': 0,
- 'del': 0
- }
- for token in rec:
- if token not in self.data and len(token) > 0:
- self.data[token] = {
- 'all': 0,
- 'cor': 0,
- 'sub': 0,
- 'ins': 0,
- 'del': 0
- }
- # Computing edit distance
- for i, lab_token in enumerate(lab):
- for j, rec_token in enumerate(rec):
- if i == 0 or j == 0:
- continue
- min_dist = sys.maxsize
- min_error = 'none'
- dist = self.space[i - 1][j]['dist'] + self.cost['del']
- error = 'del'
- if dist < min_dist:
- min_dist = dist
- min_error = error
- dist = self.space[i][j - 1]['dist'] + self.cost['ins']
- error = 'ins'
- if dist < min_dist:
- min_dist = dist
- min_error = error
- if lab_token == rec_token:
- dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor']
- error = 'cor'
- else:
- dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub']
- error = 'sub'
- if dist < min_dist:
- min_dist = dist
- min_error = error
- self.space[i][j]['dist'] = min_dist
- self.space[i][j]['error'] = min_error
- # Tracing back
- result = {
- 'lab': [],
- 'rec': [],
- 'all': 0,
- 'cor': 0,
- 'sub': 0,
- 'ins': 0,
- 'del': 0
- }
- i = len(lab) - 1
- j = len(rec) - 1
- while True:
- if self.space[i][j]['error'] == 'cor': # correct
- if len(lab[i]) > 0:
- self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
- self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1
- result['all'] = result['all'] + 1
- result['cor'] = result['cor'] + 1
- result['lab'].insert(0, lab[i])
- result['rec'].insert(0, rec[j])
- i = i - 1
- j = j - 1
- elif self.space[i][j]['error'] == 'sub': # substitution
- if len(lab[i]) > 0:
- self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
- self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1
- result['all'] = result['all'] + 1
- result['sub'] = result['sub'] + 1
- result['lab'].insert(0, lab[i])
- result['rec'].insert(0, rec[j])
- i = i - 1
- j = j - 1
- elif self.space[i][j]['error'] == 'del': # deletion
- if len(lab[i]) > 0:
- self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
- self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1
- result['all'] = result['all'] + 1
- result['del'] = result['del'] + 1
- result['lab'].insert(0, lab[i])
- result['rec'].insert(0, "")
- i = i - 1
- elif self.space[i][j]['error'] == 'ins': # insertion
- if len(rec[j]) > 0:
- self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1
- result['ins'] = result['ins'] + 1
- result['lab'].insert(0, "")
- result['rec'].insert(0, rec[j])
- j = j - 1
- elif self.space[i][j]['error'] == 'non': # starting point
- break
- else: # shouldn't reach here
- print(
- 'this should not happen , i = {i} , j = {j} , error = {error}'.
- format(i=i, j=j, error=self.space[i][j]['error']))
- return result
-
- def overall(self):
- result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
- for token in self.data:
- result['all'] = result['all'] + self.data[token]['all']
- result['cor'] = result['cor'] + self.data[token]['cor']
- result['sub'] = result['sub'] + self.data[token]['sub']
- result['ins'] = result['ins'] + self.data[token]['ins']
- result['del'] = result['del'] + self.data[token]['del']
- return result
-
- def cluster(self, data):
- result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
- for token in data:
- if token in self.data:
- result['all'] = result['all'] + self.data[token]['all']
- result['cor'] = result['cor'] + self.data[token]['cor']
- result['sub'] = result['sub'] + self.data[token]['sub']
- result['ins'] = result['ins'] + self.data[token]['ins']
- result['del'] = result['del'] + self.data[token]['del']
- return result
-
- def keys(self):
- return list(self.data.keys())
-
-
-def width(string):
- return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
-
-
-def default_cluster(word):
- unicode_names = [unicodedata.name(char) for char in word]
- for i in reversed(range(len(unicode_names))):
- if unicode_names[i].startswith('DIGIT'): # 1
- unicode_names[i] = 'Number' # 'DIGIT'
- elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or
- unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')):
- # 明 / 郎
- unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH'
- elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or
- unicode_names[i].startswith('LATIN SMALL LETTER')):
- # A / a
- unicode_names[i] = 'English' # 'LATIN LETTER'
- elif unicode_names[i].startswith('HIRAGANA LETTER'): # は こ め
- unicode_names[i] = 'Japanese' # 'GANA LETTER'
- elif (unicode_names[i].startswith('AMPERSAND') or
- unicode_names[i].startswith('APOSTROPHE') or
- unicode_names[i].startswith('COMMERCIAL AT') or
- unicode_names[i].startswith('DEGREE CELSIUS') or
- unicode_names[i].startswith('EQUALS SIGN') or
- unicode_names[i].startswith('FULL STOP') or
- unicode_names[i].startswith('HYPHEN-MINUS') or
- unicode_names[i].startswith('LOW LINE') or
- unicode_names[i].startswith('NUMBER SIGN') or
- unicode_names[i].startswith('PLUS SIGN') or
- unicode_names[i].startswith('SEMICOLON')):
- # & / ' / @ / ℃ / = / . / - / _ / # / + / ;
- del unicode_names[i]
- else:
- return 'Other'
- if len(unicode_names) == 0:
- return 'Other'
- if len(unicode_names) == 1:
- return unicode_names[0]
- for i in range(len(unicode_names) - 1):
- if unicode_names[i] != unicode_names[i + 1]:
- return 'Other'
- return unicode_names[0]
-
-
-def usage():
- print(
- "compute-wer.py : compute word error rate (WER) and align recognition results and references."
- )
- print(
- " usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer"
- )
-
+from paddlespeech.dataset.s2t import compute_wer_main
if __name__ == '__main__':
- if len(sys.argv) == 1:
- usage()
- sys.exit(0)
- calculator = Calculator()
- cluster_file = ''
- ignore_words = set()
- tochar = False
- verbose = 1
- padding_symbol = ' '
- case_sensitive = False
- max_words_per_line = sys.maxsize
- split = None
- while len(sys.argv) > 3:
- a = '--maxw='
- if sys.argv[1].startswith(a):
- b = sys.argv[1][len(a):]
- del sys.argv[1]
- max_words_per_line = int(b)
- continue
- a = '--rt='
- if sys.argv[1].startswith(a):
- b = sys.argv[1][len(a):].lower()
- del sys.argv[1]
- remove_tag = (b == 'true') or (b != '0')
- continue
- a = '--cs='
- if sys.argv[1].startswith(a):
- b = sys.argv[1][len(a):].lower()
- del sys.argv[1]
- case_sensitive = (b == 'true') or (b != '0')
- continue
- a = '--cluster='
- if sys.argv[1].startswith(a):
- cluster_file = sys.argv[1][len(a):]
- del sys.argv[1]
- continue
- a = '--splitfile='
- if sys.argv[1].startswith(a):
- split_file = sys.argv[1][len(a):]
- del sys.argv[1]
- split = dict()
- with codecs.open(split_file, 'r', 'utf-8') as fh:
- for line in fh: # line in unicode
- words = line.strip().split()
- if len(words) >= 2:
- split[words[0]] = words[1:]
- continue
- a = '--ig='
- if sys.argv[1].startswith(a):
- ignore_file = sys.argv[1][len(a):]
- del sys.argv[1]
- with codecs.open(ignore_file, 'r', 'utf-8') as fh:
- for line in fh: # line in unicode
- line = line.strip()
- if len(line) > 0:
- ignore_words.add(line)
- continue
- a = '--char='
- if sys.argv[1].startswith(a):
- b = sys.argv[1][len(a):].lower()
- del sys.argv[1]
- tochar = (b == 'true') or (b != '0')
- continue
- a = '--v='
- if sys.argv[1].startswith(a):
- b = sys.argv[1][len(a):].lower()
- del sys.argv[1]
- verbose = 0
- try:
- verbose = int(b)
- except:
- if b == 'true' or b != '0':
- verbose = 1
- continue
- a = '--padding-symbol='
- if sys.argv[1].startswith(a):
- b = sys.argv[1][len(a):].lower()
- del sys.argv[1]
- if b == 'space':
- padding_symbol = ' '
- elif b == 'underline':
- padding_symbol = '_'
- continue
- if True or sys.argv[1].startswith('-'):
- #ignore invalid switch
- del sys.argv[1]
- continue
-
- if not case_sensitive:
- ig = set([w.upper() for w in ignore_words])
- ignore_words = ig
-
- default_clusters = {}
- default_words = {}
-
- ref_file = sys.argv[1]
- hyp_file = sys.argv[2]
- rec_set = {}
- if split and not case_sensitive:
- newsplit = dict()
- for w in split:
- words = split[w]
- for i in range(len(words)):
- words[i] = words[i].upper()
- newsplit[w.upper()] = words
- split = newsplit
-
- with codecs.open(hyp_file, 'r', 'utf-8') as fh:
- for line in fh:
- if tochar:
- array = characterize(line)
- else:
- array = line.strip().split()
- if len(array) == 0: continue
- fid = array[0]
- rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive,
- split)
-
- # compute error rate on the interaction of reference file and hyp file
- for line in open(ref_file, 'r', encoding='utf-8'):
- if tochar:
- array = characterize(line)
- else:
- array = line.rstrip('\n').split()
- if len(array) == 0: continue
- fid = array[0]
- if fid not in rec_set:
- continue
- lab = normalize(array[1:], ignore_words, case_sensitive, split)
- rec = rec_set[fid]
- if verbose:
- print('\nutt: %s' % fid)
-
- for word in rec + lab:
- if word not in default_words:
- default_cluster_name = default_cluster(word)
- if default_cluster_name not in default_clusters:
- default_clusters[default_cluster_name] = {}
- if word not in default_clusters[default_cluster_name]:
- default_clusters[default_cluster_name][word] = 1
- default_words[word] = default_cluster_name
-
- result = calculator.calculate(lab, rec)
- if verbose:
- if result['all'] != 0:
- wer = float(result['ins'] + result['sub'] + result[
- 'del']) * 100.0 / result['all']
- else:
- wer = 0.0
- print('WER: %4.2f %%' % wer, end=' ')
- print('N=%d C=%d S=%d D=%d I=%d' %
- (result['all'], result['cor'], result['sub'], result['del'],
- result['ins']))
- space = {}
- space['lab'] = []
- space['rec'] = []
- for idx in range(len(result['lab'])):
- len_lab = width(result['lab'][idx])
- len_rec = width(result['rec'][idx])
- length = max(len_lab, len_rec)
- space['lab'].append(length - len_lab)
- space['rec'].append(length - len_rec)
- upper_lab = len(result['lab'])
- upper_rec = len(result['rec'])
- lab1, rec1 = 0, 0
- while lab1 < upper_lab or rec1 < upper_rec:
- if verbose > 1:
- print('lab(%s):' % fid.encode('utf-8'), end=' ')
- else:
- print('lab:', end=' ')
- lab2 = min(upper_lab, lab1 + max_words_per_line)
- for idx in range(lab1, lab2):
- token = result['lab'][idx]
- print('{token}'.format(token=token), end='')
- for n in range(space['lab'][idx]):
- print(padding_symbol, end='')
- print(' ', end='')
- print()
- if verbose > 1:
- print('rec(%s):' % fid.encode('utf-8'), end=' ')
- else:
- print('rec:', end=' ')
- rec2 = min(upper_rec, rec1 + max_words_per_line)
- for idx in range(rec1, rec2):
- token = result['rec'][idx]
- print('{token}'.format(token=token), end='')
- for n in range(space['rec'][idx]):
- print(padding_symbol, end='')
- print(' ', end='')
- print('\n', end='\n')
- lab1 = lab2
- rec1 = rec2
-
- if verbose:
- print(
- '==========================================================================='
- )
- print()
-
- result = calculator.overall()
- if result['all'] != 0:
- wer = float(result['ins'] + result['sub'] + result[
- 'del']) * 100.0 / result['all']
- else:
- wer = 0.0
- print('Overall -> %4.2f %%' % wer, end=' ')
- print('N=%d C=%d S=%d D=%d I=%d' %
- (result['all'], result['cor'], result['sub'], result['del'],
- result['ins']))
- if not verbose:
- print()
-
- if verbose:
- for cluster_id in default_clusters:
- result = calculator.cluster(
- [k for k in default_clusters[cluster_id]])
- if result['all'] != 0:
- wer = float(result['ins'] + result['sub'] + result[
- 'del']) * 100.0 / result['all']
- else:
- wer = 0.0
- print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
- print('N=%d C=%d S=%d D=%d I=%d' %
- (result['all'], result['cor'], result['sub'], result['del'],
- result['ins']))
- if len(cluster_file) > 0: # compute separated WERs for word clusters
- cluster_id = ''
- cluster = []
- for line in open(cluster_file, 'r', encoding='utf-8'):
- for token in line.decode('utf-8').rstrip('\n').split():
-                # end of cluster reached, like </Keyword>
-                if token[0:2] == '</' and token[len(token)-1] == '>' and \
-                   token.lstrip('</').rstrip('>') == cluster_id :
- result = calculator.cluster(cluster)
- if result['all'] != 0:
- wer = float(result['ins'] + result['sub'] + result[
- 'del']) * 100.0 / result['all']
- else:
- wer = 0.0
- print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
- print('N=%d C=%d S=%d D=%d I=%d' %
- (result['all'], result['cor'], result['sub'],
- result['del'], result['ins']))
- cluster_id = ''
- cluster = []
-                # begin of cluster reached, like <Keyword>
- elif token[0] == '<' and token[len(token)-1] == '>' and \
- cluster_id == '' :
- cluster_id = token.lstrip('<').rstrip('>')
- cluster = []
- # general terms, like WEATHER / CAR / ...
- else:
- cluster.append(token)
- print()
- print(
- '==========================================================================='
- )
+ compute_wer_main()
diff --git a/utils/compute_mean_std.py b/utils/compute_mean_std.py
index e47554dc..6e3fc0db 100755
--- a/utils/compute_mean_std.py
+++ b/utils/compute_mean_std.py
@@ -13,75 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compute mean and std for feature normalizer, and save to file."""
-import argparse
-import functools
-
-from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline
-from paddlespeech.s2t.frontend.featurizer.audio_featurizer import AudioFeaturizer
-from paddlespeech.s2t.frontend.normalizer import FeatureNormalizer
-from paddlespeech.s2t.utils.utility import add_arguments
-from paddlespeech.s2t.utils.utility import print_arguments
-
-parser = argparse.ArgumentParser(description=__doc__)
-add_arg = functools.partial(add_arguments, argparser=parser)
-# yapf: disable
-add_arg('num_samples', int, 2000, "# of samples to for statistics.")
-
-add_arg('spectrum_type', str,
- 'linear',
- "Audio feature type. Options: linear, mfcc, fbank.",
- choices=['linear', 'mfcc', 'fbank'])
-add_arg('feat_dim', int, 13, "Audio feature dim.")
-add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
-add_arg('stride_ms', int, 10, "stride length in ms.")
-add_arg('window_ms', int, 20, "stride length in ms.")
-add_arg('sample_rate', int, 16000, "target sample rate.")
-add_arg('use_dB_normalization', bool, True, "do dB normalization.")
-add_arg('target_dB', int, -20, "target dB.")
-
-add_arg('manifest_path', str,
- 'data/librispeech/manifest.train',
- "Filepath of manifest to compute normalizer's mean and stddev.")
-add_arg('num_workers',
- default=0,
- type=int,
- help='num of subprocess workers for processing')
-add_arg('output_path', str,
- 'data/librispeech/mean_std.npz',
- "Filepath of write mean and stddev to (.npz).")
-# yapf: disable
-args = parser.parse_args()
-
-
-def main():
- print_arguments(args, globals())
-
- augmentation_pipeline = AugmentationPipeline('{}')
- audio_featurizer = AudioFeaturizer(
- spectrum_type=args.spectrum_type,
- feat_dim=args.feat_dim,
- delta_delta=args.delta_delta,
- stride_ms=float(args.stride_ms),
- window_ms=float(args.window_ms),
- n_fft=None,
- max_freq=None,
- target_sample_rate=args.sample_rate,
- use_dB_normalization=args.use_dB_normalization,
- target_dB=args.target_dB,
- dither=0.0)
-
- def augment_and_featurize(audio_segment):
- augmentation_pipeline.transform_audio(audio_segment)
- return audio_featurizer.featurize(audio_segment)
-
- normalizer = FeatureNormalizer(
- mean_std_filepath=None,
- manifest_path=args.manifest_path,
- featurize_func=augment_and_featurize,
- num_samples=args.num_samples,
- num_workers=args.num_workers)
- normalizer.write_to_file(args.output_path)
-
+from paddlespeech.dataset.s2t import compute_mean_std_main
if __name__ == '__main__':
- main()
+ compute_mean_std_main()
diff --git a/utils/copy-feats.py b/utils/copy-feats.py
index dc7a70b4..89ea30f9 100755
--- a/utils/copy-feats.py
+++ b/utils/copy-feats.py
@@ -4,7 +4,7 @@ import logging
from distutils.util import strtobool
-from paddlespeech.s2t.transform.transformation import Transformation
+from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.utils.cli_readers import file_reader_helper
from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style
diff --git a/utils/feat-to-shape.py b/utils/feat-to-shape.py
index bbc9242f..e5e014de 100755
--- a/utils/feat-to-shape.py
+++ b/utils/feat-to-shape.py
@@ -3,7 +3,7 @@ import argparse
import logging
import sys
-from paddlespeech.s2t.transform.transformation import Transformation
+from paddlespeech.audio.transform.transformation import Transformation
from paddlespeech.s2t.utils.cli_readers import file_reader_helper
from paddlespeech.s2t.utils.cli_utils import get_commandline_args
from paddlespeech.s2t.utils.cli_utils import is_scipy_wav_style
diff --git a/utils/format_data.py b/utils/format_data.py
index 6db2a1bb..574cb735 100755
--- a/utils/format_data.py
+++ b/utils/format_data.py
@@ -13,130 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest with more metadata."""
-import argparse
-import functools
-import json
-
-import jsonlines
-
-from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.frontend.utility import load_cmvn
-from paddlespeech.s2t.io.utility import feat_type
-from paddlespeech.s2t.utils.utility import add_arguments
-from paddlespeech.s2t.utils.utility import print_arguments
-
-parser = argparse.ArgumentParser(description=__doc__)
-add_arg = functools.partial(add_arguments, argparser=parser)
-# yapf: disable
-add_arg('cmvn_path', str,
- 'examples/librispeech/data/mean_std.json',
- "Filepath of cmvn.")
-add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
-add_arg('vocab_path', str,
- 'examples/librispeech/data/vocab.txt',
- "Filepath of the vocabulary.")
-add_arg('manifest_paths', str,
- None,
- "Filepaths of manifests for building vocabulary. "
- "You can provide multiple manifest files.",
- nargs='+',
- required=True)
-# bpe
-add_arg('spm_model_prefix', str, None,
- "spm model prefix, spm_model_%(bpe_mode)_%(count_threshold), only need when `unit_type` is spm")
-add_arg('output_path', str, None, "filepath of formated manifest.", required=True)
-# yapf: disable
-args = parser.parse_args()
-
-
-def main():
- print_arguments(args, globals())
- fout = open(args.output_path, 'w', encoding='utf-8')
-
- # get feat dim
- filetype = args.cmvn_path.split(".")[-1]
- mean, istd = load_cmvn(args.cmvn_path, filetype=filetype)
- feat_dim = mean.shape[0] #(D)
- print(f"Feature dim: {feat_dim}")
-
- text_feature = TextFeaturizer(args.unit_type, args.vocab_path, args.spm_model_prefix)
- vocab_size = text_feature.vocab_size
- print(f"Vocab size: {vocab_size}")
-
- # josnline like this
- # {
- # "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
- # "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
- # "utt2spk": "111-2222",
- # "utt": "111-2222-333"
- # }
- count = 0
- for manifest_path in args.manifest_paths:
- with jsonlines.open(str(manifest_path), 'r') as reader:
- manifest_jsons = list(reader)
-
- for line_json in manifest_jsons:
- output_json = {
- "input": [],
- "output": [],
- 'utt': line_json['utt'],
- 'utt2spk': line_json.get('utt2spk', 'global'),
- }
-
- # output
- line = line_json['text']
- if isinstance(line, str):
- # only one target
- tokens = text_feature.tokenize(line)
- tokenids = text_feature.featurize(line)
- output_json['output'].append({
- 'name': 'target1',
- 'shape': (len(tokenids), vocab_size),
- 'text': line,
- 'token': ' '.join(tokens),
- 'tokenid': ' '.join(map(str, tokenids)),
- })
- else:
- # isinstance(line, list), multi target in one vocab
- for i, item in enumerate(line, 1):
- tokens = text_feature.tokenize(item)
- tokenids = text_feature.featurize(item)
- output_json['output'].append({
- 'name': f'target{i}',
- 'shape': (len(tokenids), vocab_size),
- 'text': item,
- 'token': ' '.join(tokens),
- 'tokenid': ' '.join(map(str, tokenids)),
- })
-
- # input
- line = line_json['feat']
- if isinstance(line, str):
- # only one input
- feat_shape = line_json['feat_shape']
- assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
- filetype = feat_type(line)
- if filetype == 'sound':
- feat_shape.append(feat_dim)
- else: # kaldi
- raise NotImplementedError('no support kaldi feat now!')
-
- output_json['input'].append({
- "name": "input1",
- "shape": feat_shape,
- "feat": line,
- "filetype": filetype,
- })
- else:
- # isinstance(line, list), multi input
- raise NotImplementedError("not support multi input now!")
-
- fout.write(json.dumps(output_json) + '\n')
- count += 1
-
- print(f"{args.manifest_paths} Examples number: {count}")
- fout.close()
-
+from paddlespeech.dataset.s2t import format_data_main
if __name__ == '__main__':
- main()
+ format_data_main()
diff --git a/utils/format_rsl.py b/utils/format_rsl.py
index 8230416c..a6845a67 100644
--- a/utils/format_rsl.py
+++ b/utils/format_rsl.py
@@ -11,96 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import argparse
+from paddlespeech.dataset.s2t import format_rsl_main
-import jsonlines
-
-
-def trans_hyp(origin_hyp, trans_hyp=None, trans_hyp_sclite=None):
- """
- Args:
- origin_hyp: The input json file which contains the model output
- trans_hyp: The output file for caculate CER/WER
- trans_hyp_sclite: The output file for caculate CER/WER using sclite
- """
- input_dict = {}
-
- with open(origin_hyp, "r+", encoding="utf8") as f:
- for item in jsonlines.Reader(f):
- input_dict[item["utt"]] = item["hyps"][0]
- if trans_hyp is not None:
- with open(trans_hyp, "w+", encoding="utf8") as f:
- for key in input_dict.keys():
- f.write(key + " " + input_dict[key] + "\n")
- if trans_hyp_sclite is not None:
- with open(trans_hyp_sclite, "w+") as f:
- for key in input_dict.keys():
- line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
- f.write(line)
-
-
-def trans_ref(origin_ref, trans_ref=None, trans_ref_sclite=None):
- """
- Args:
- origin_hyp: The input json file which contains the model output
- trans_hyp: The output file for caculate CER/WER
- trans_hyp_sclite: The output file for caculate CER/WER using sclite
- """
- input_dict = {}
-
- with open(origin_ref, "r", encoding="utf8") as f:
- for item in jsonlines.Reader(f):
- input_dict[item["utt"]] = item["text"]
- if trans_ref is not None:
- with open(trans_ref, "w", encoding="utf8") as f:
- for key in input_dict.keys():
- f.write(key + " " + input_dict[key] + "\n")
-
- if trans_ref_sclite is not None:
- with open(trans_ref_sclite, "w") as f:
- for key in input_dict.keys():
- line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
- f.write(line)
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- prog='format hyp file for compute CER/WER', add_help=True)
- parser.add_argument(
- '--origin_hyp', type=str, default=None, help='origin hyp file')
- parser.add_argument(
- '--trans_hyp',
- type=str,
- default=None,
- help='hyp file for caculating CER/WER')
- parser.add_argument(
- '--trans_hyp_sclite',
- type=str,
- default=None,
- help='hyp file for caculating CER/WER by sclite')
-
- parser.add_argument(
- '--origin_ref', type=str, default=None, help='origin ref file')
- parser.add_argument(
- '--trans_ref',
- type=str,
- default=None,
- help='ref file for caculating CER/WER')
- parser.add_argument(
- '--trans_ref_sclite',
- type=str,
- default=None,
- help='ref file for caculating CER/WER by sclite')
- parser_args = parser.parse_args()
-
- if parser_args.origin_hyp is not None:
- trans_hyp(
- origin_hyp=parser_args.origin_hyp,
- trans_hyp=parser_args.trans_hyp,
- trans_hyp_sclite=parser_args.trans_hyp_sclite, )
-
- if parser_args.origin_ref is not None:
- trans_ref(
- origin_ref=parser_args.origin_ref,
- trans_ref=parser_args.trans_ref,
- trans_ref_sclite=parser_args.trans_ref_sclite, )
+if __name__ == '__main__':
+ format_rsl_main()
diff --git a/utils/format_triplet_data.py b/utils/format_triplet_data.py
index 44ff4527..e9a0cf54 100755
--- a/utils/format_triplet_data.py
+++ b/utils/format_triplet_data.py
@@ -22,8 +22,8 @@ import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.io.utility import feat_type
-from paddlespeech.s2t.utils.utility import add_arguments
-from paddlespeech.s2t.utils.utility import print_arguments
+from paddlespeech.utils.argparse import add_arguments
+from paddlespeech.utils.argparse import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
diff --git a/utils/fst/ctc_token_fst.py b/utils/fst/ctc_token_fst.py
index 2262912c..f63e9cda 100755
--- a/utils/fst/ctc_token_fst.py
+++ b/utils/fst/ctc_token_fst.py
@@ -6,7 +6,7 @@ def main(args):
"""Token Transducer"""
# entry
     print('0 1 <eps> <eps>')
-    # skip begining and ending <blank>
+    # skip beginning and ending <blank>
     print('1 1 <blank> <eps>')
     print('2 2 <blank> <eps>')
# exit
diff --git a/utils/manifest_key_value.py b/utils/manifest_key_value.py
index 3825fb9b..5ffe8e55 100755
--- a/utils/manifest_key_value.py
+++ b/utils/manifest_key_value.py
@@ -6,8 +6,8 @@ from pathlib import Path
import jsonlines
-from utils.utility import add_arguments
-from utils.utility import print_arguments
+from paddlespeech.utils.argparse import add_arguments
+from paddlespeech.utils.argparse import print_arguments
def main(args):
diff --git a/utils/tokenizer.perl b/utils/tokenizer.perl
index ae97d658..836fe19c 100644
--- a/utils/tokenizer.perl
+++ b/utils/tokenizer.perl
@@ -296,7 +296,7 @@ sub tokenize
$text =~ s/DOTMULTI\./DOTDOTMULTI/g;
}
- # seperate out "," except if within numbers (5,300)
+ # separate out "," except if within numbers (5,300)
#$text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
# separate out "," except if within numbers (5,300)