diff --git a/.github/stale.yml b/.github/stale.yml
index da19b6606..6b0da9b98 100644
--- a/.github/stale.yml
+++ b/.github/stale.yml
@@ -6,7 +6,8 @@ daysUntilClose: 30
 exemptLabels:
   - Roadmap 
   - Bug
-  - New Feature 
+  - feature request
+  - Tips
 # Label to use when marking an issue as stale
 staleLabel: Stale
 # Comment to post when marking an issue as stale. Set to `false` to disable
@@ -17,4 +18,4 @@ markComment: >
 unmarkComment: false
 # Comment to post when closing a stale issue. Set to `false` to disable
 closeComment: >
-  This issue is closed. Please re-open if needed.
\ No newline at end of file
+  This issue is closed. Please re-open if needed.
diff --git a/README.md b/README.md
index 2fb773634..d3b09576d 100644
--- a/README.md
+++ b/README.md
@@ -97,26 +97,40 @@
   </thead>
   <tbody>
    <tr>
-      <td >Life was like a box of chocolates, you never know what you're gonna get.</td>
+      <td>Life was like a box of chocolates, you never know what you're gonna get.</td>
       <td align = "center">
       <a href="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_1.wav" rel="nofollow">
             <img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br>
       </td>
     </tr>
     <tr>
-      <td >早上好，今天是2020/10/29，最低温度是-3°C。</td>
+      <td>早上好，今天是2020/10/29，最低温度是-3°C。</td>
       <td align = "center">
       <a href="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/001.wav" rel="nofollow">
             <img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br>
       </td>
     </tr>
     <tr>
-      <td >季姬寂，集鸡，鸡即棘鸡。棘鸡饥叽，季姬及箕稷济鸡。鸡既济，跻姬笈，季姬忌，急咭鸡，鸡急，继圾几，季姬急，即籍箕击鸡，箕疾击几伎，伎即齑，鸡叽集几基，季姬急极屐击鸡，鸡既殛，季姬激，即记《季姬击鸡记》。</td>
+      <td>季姬寂，集鸡，鸡即棘鸡。棘鸡饥叽，季姬及箕稷济鸡。鸡既济，跻姬笈，季姬忌，急咭鸡，鸡急，继圾几，季姬急，即籍箕击鸡，箕疾击几伎，伎即齑，鸡叽集几基，季姬急极屐击鸡，鸡既殛，季姬激，即记《季姬击鸡记》。</td>
       <td align = "center">
       <a href="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/jijiji.wav" rel="nofollow">
             <img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br>
       </td>
     </tr>
+    <tr>
+      <td>大家好，我是 parrot 虚拟老师，我们来读一首诗，我与春风皆过客，I and the spring breeze are passing by，你携秋水揽星河，you take the autumn water to take the galaxy。</td>
+      <td align = "center">
+      <a href="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/labixiaoxin.wav" rel="nofollow">
+            <img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br>
+      </td>
+    </tr>
+    <tr>
+      <td>宜家唔系事必要你讲，但系你所讲嘅说话将会变成呈堂证供。</td>
+      <td align = "center">
+      <a href="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/chengtangzhenggong.wav" rel="nofollow">
+            <img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br>
+      </td>
+    </tr>
   </tbody>
 </table>
 
@@ -157,18 +171,19 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
   - 🧩  *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV).
 
 ### Recent Update
-- 🔥 2022.01.10: Add [code-switch asr CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition).
-- 👑 2022.01.06: Add [code-switch asr tal_cs recipe](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/).
-- 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model).
-- 🎉 2022.11.30: Add [TTS Android Demo](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid).
+- 🎉 2023.02.16: Add [Cantonese TTS](./examples/canton/tts3).
+- 🔥 2023.01.10: Add [code-switch asr CLI and Demos](./demos/speech_recognition).
+- 👑 2023.01.06: Add [code-switch asr tal_cs recipe](./examples/tal_cs/asr1/).
+- 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](./examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model).
+- 🎉 2022.11.30: Add [TTS Android Demo](./demos/TTSAndroid).
 - 🤗 2022.11.28: PP-TTS and PP-ASR demos are available in [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) and [official website
  of paddlepaddle](https://www.paddlepaddle.org.cn/models).
 - 👑 2022.11.18: Add [Whisper CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/pull/2640), support multi language recognition and translation.
-- 🔥 2022.11.18: Add [Wav2vec2 CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_ssl), Support ASR and Feature Extraction.
+- 🔥 2022.11.18: Add [Wav2vec2 CLI and Demos](./demos/speech_ssl), Support ASR and Feature Extraction.
 - 🎉 2022.11.17: Add [male voice for TTS](https://github.com/PaddlePaddle/PaddleSpeech/pull/2660).
-- 🔥 2022.11.07: Add [U2/U2++ C++ High Performance Streaming ASR Deployment](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/speechx/examples/u2pp_ol/wenetspeech).
+- 🔥 2022.11.07: Add [U2/U2++ C++ High Performance Streaming ASR Deployment](./speechx/examples/u2pp_ol/wenetspeech).
 - 👑 2022.11.01: Add [Adversarial Loss](https://arxiv.org/pdf/1907.04448.pdf) for [Chinese English mixed TTS](./examples/zh_en_tts/tts3).
-- 🔥 2022.10.26: Add [Prosody Prediction](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/rhy) for TTS.
+- 🔥 2022.10.26: Add [Prosody Prediction](./examples/other/rhy) for TTS.
 - 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for TTS Chinese Text Frontend.
 - 👑 2022.10.11: Add [Wav2vec2ASR-en](./examples/librispeech/asr3), wav2vec2.0 fine-tuning for ASR on LibriSpeech.
 - 🔥 2022.09.26: Add Voice Cloning, TTS finetune, and [ERNIE-SAT](https://arxiv.org/abs/2211.03545) in [PaddleSpeech Web Demo](./demos/speech_web).
@@ -191,7 +206,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
 - Scan the QR code below with your Wechat, you can access to official technical exchange group and get the bonus ( more than 20GB learning materials, such as papers, codes and videos ) and the live link of the lessons. Look forward to your participation.
 
 <div align="center">
-<img src="https://user-images.githubusercontent.com/30135920/196351517-19dece6b-d6ea-448e-a341-d6bfe5712ec1.jpg"  width = "200"  />
+<img src="https://user-images.githubusercontent.com/30135920/212860467-9e943cc3-8be8-49a4-97fd-7c94aad8e979.jpg"  width = "200"  />
 </div>
 
 ## Installation
@@ -987,8 +1002,9 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P
 - Many thanks to [vpegasus](https://github.com/vpegasus)/[xuesebot](https://github.com/vpegasus/xuesebot) for developing a rasa chatbot,which is able to speak and listen thanks to PaddleSpeech.
 - Many thanks to [chenkui164](https://github.com/chenkui164)/[FastASR](https://github.com/chenkui164/FastASR) for the C++ inference implementation of PaddleSpeech ASR.
 - Many thanks to [heyudage](https://github.com/heyudage)/[VoiceTyping](https://github.com/heyudage/VoiceTyping) for the real-time voice typing tool implementation of PaddleSpeech ASR streaming services.
-
+- Many thanks to [EscaticZheng](https://github.com/EscaticZheng)/[ps3.9wheel-install](https://github.com/EscaticZheng/ps3.9wheel-install) for the Python 3.9 prebuilt wheel for installing PaddleSpeech on Windows without Visual Studio.
 Besides, PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information.
+- Many thanks to [chinobing](https://github.com/chinobing)/[FastAPI-PaddleSpeech-Audio-To-Text](https://github.com/chinobing/FastAPI-PaddleSpeech-Audio-To-Text) for converting audio to text based on FastAPI and PaddleSpeech.
 
 <a name="License"></a>
 ## License
diff --git a/README_cn.md b/README_cn.md
index 53f6a66e4..be1c5d44f 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -122,6 +122,20 @@
             <img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br>
       </td>
     </tr>
+    <tr>
+      <td>大家好，我是 parrot 虚拟老师，我们来读一首诗，我与春风皆过客，I and the spring breeze are passing by，你携秋水揽星河，you take the autumn water to take the galaxy。</td>
+      <td align = "center">
+      <a href="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/labixiaoxin.wav" rel="nofollow">
+            <img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br>
+      </td>
+    </tr>
+    <tr>
+      <td>宜家唔系事必要你讲，但系你所讲嘅说话将会变成呈堂证供。</td>
+      <td align = "center">
+      <a href="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/chengtangzhenggong.wav" rel="nofollow">
+            <img align="center" src="./docs/images/audio_icon.png" width="200" style="max-width: 100%;"></a><br>
+      </td>
+    </tr>
   </tbody>
 </table>
 
@@ -161,20 +175,19 @@
   - 🔬 主流模型及数据集: 本工具包实现了参与整条语音任务流水线的各个模块，并且采用了主流数据集如 LibriSpeech、LJSpeech、AIShell、CSMSC，详情请见 [模型列表](#model-list)。
   - 🧩 级联模型应用: 作为传统语音任务的扩展，我们结合了自然语言处理、计算机视觉等任务，实现更接近实际需求的产业级应用。
 
-
-  
 ### 近期更新
-- 🔥 2022.01.10: 新增 [中英混合 ASR CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition).
-- 👑 2022.01.06: 新增 [ASR中英混合 tal_cs 训练推理流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/).
-- 🎉 2022.12.02: 新增 [端到端韵律预测全流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。
-- 🎉 2022.11.30: 新增 [TTS Android 部署示例](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid)。
+- 🎉 2023.02.16: 新增[粤语语音合成](./examples/canton/tts3)。
+- 🔥 2023.01.10: 新增[中英混合 ASR CLI 和 Demos](./demos/speech_recognition)。
+- 👑 2023.01.06: 新增 [ASR 中英混合 tal_cs 训练推理流程](./examples/tal_cs/asr1/)。
+- 🎉 2022.12.02: 新增[端到端韵律预测全流程](./examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。
+- 🎉 2022.11.30: 新增 [TTS Android 部署示例](./demos/TTSAndroid)。
 - 🤗 2022.11.28: PP-TTS and PP-ASR 示例可在 [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) 和[飞桨官网](https://www.paddlepaddle.org.cn/models)体验！
 - 👑 2022.11.18: 新增 [Whisper CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/pull/2640), 支持多种语言的识别与翻译。
-- 🔥 2022.11.18: 新增 [Wav2vec2 CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_ssl), 支持 ASR 和 特征提取.
+- 🔥 2022.11.18: 新增 [Wav2vec2 CLI 和 Demos](./demos/speech_ssl), 支持 ASR 和特征提取。
 - 🎉 2022.11.17: TTS 新增[高质量男性音色](https://github.com/PaddlePaddle/PaddleSpeech/pull/2660)。
-- 🔥 2022.11.07: 新增 [U2/U2++ 高性能流式 ASR C++ 部署](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/speechx/examples/u2pp_ol/wenetspeech)。
+- 🔥 2022.11.07: 新增 [U2/U2++ 高性能流式 ASR C++ 部署](./speechx/examples/u2pp_ol/wenetspeech)。
 - 👑 2022.11.01: [中英文混合 TTS](./examples/zh_en_tts/tts3) 新增 [Adversarial Loss](https://arxiv.org/pdf/1907.04448.pdf) 模块。
-- 🔥 2022.10.26: TTS 新增[韵律预测](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/rhy)功能。
+- 🔥 2022.10.26: TTS 新增[韵律预测](./examples/other/rhy)功能。
 - 🎉 2022.10.21: TTS 中文文本前端新增 [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) 功能。
 - 👑 2022.10.11: 新增 [Wav2vec2ASR-en](./examples/librispeech/asr3), 在 LibriSpeech 上针对 ASR 任务对 wav2vec2.0 的 finetuning。
 - 🔥 2022.09.26: 新增 Voice Cloning, TTS finetune 和 [ERNIE-SAT](https://arxiv.org/abs/2211.03545) 到 [PaddleSpeech 网页应用](./demos/speech_web)。
@@ -202,7 +215,7 @@
 微信扫描二维码关注公众号，点击“马上报名”填写问卷加入官方交流群，获得更高效的问题答疑，与各行各业开发者充分交流，期待您的加入。
 
 <div align="center">
-<img src="https://user-images.githubusercontent.com/30135920/196351517-19dece6b-d6ea-448e-a341-d6bfe5712ec1.jpg"  width = "200"  />
+<img src="https://user-images.githubusercontent.com/30135920/212860467-9e943cc3-8be8-49a4-97fd-7c94aad8e979.jpg"  width = "200"  />
 </div>
 
 <a name="安装"></a>
@@ -988,10 +1001,11 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块：文本前端、声
 - 非常感谢 [awmmmm](https://github.com/awmmmm) 提供 fastspeech2 aishell3 conformer 预训练模型。
 - 非常感谢 [phecda-xu](https://github.com/phecda-xu)/[PaddleDubbing](https://github.com/phecda-xu/PaddleDubbing) 基于 PaddleSpeech 的 TTS 模型搭建带 GUI 操作界面的配音工具。
 - 非常感谢 [jerryuhoo](https://github.com/jerryuhoo)/[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk) 基于 PaddleSpeech 的 TTS GUI 界面和基于 ASR 制作数据集的相关代码。
-
 - 非常感谢 [vpegasus](https://github.com/vpegasus)/[xuesebot](https://github.com/vpegasus/xuesebot) 基于 PaddleSpeech 的 ASR 与 TTS 设计的可听、说对话机器人。
 - 非常感谢 [chenkui164](https://github.com/chenkui164)/[FastASR](https://github.com/chenkui164/FastASR) 对 PaddleSpeech 的 ASR 进行 C++ 推理实现。
 - 非常感谢 [heyudage](https://github.com/heyudage)/[VoiceTyping](https://github.com/heyudage/VoiceTyping) 基于 PaddleSpeech 的 ASR 流式服务实现的实时语音输入法工具。
+- 非常感谢 [EscaticZheng](https://github.com/EscaticZheng)/[ps3.9wheel-install](https://github.com/EscaticZheng/ps3.9wheel-install) 对 PaddleSpeech 在 Windows 下的安装提供了无需 Visual Studio，基于 python3.9 的预编译依赖安装包。
+- 非常感谢 [chinobing](https://github.com/chinobing)/[FastAPI-PaddleSpeech-Audio-To-Text](https://github.com/chinobing/FastAPI-PaddleSpeech-Audio-To-Text) 利用 FastAPI 实现 PaddleSpeech 语音转文字，文件上传、分割、转换进度显示、后台更新任务并以 csv 格式输出。
 
 此外，PaddleSpeech 依赖于许多开源存储库。有关更多信息，请参阅 [references](./docs/source/reference.md)。
 
diff --git a/audio/setup.py b/audio/setup.py
index 82e9a55a5..823e5dfad 100644
--- a/audio/setup.py
+++ b/audio/setup.py
@@ -40,14 +40,9 @@ COMMITID = 'none'
 base = [
     "kaldiio",
     "librosa==0.8.1",
-    "scipy>=1.0.0",
-    "soundfile~=0.10",
-    "colorlog",
-    "pathos == 0.2.8",
+    "pathos",
     "pybind11",
     "parameterized",
-    "tqdm",
-    "scikit-learn"
 ]
 
 requirements = {
@@ -273,7 +268,7 @@ def main():
         },
 
         # Package info
-        packages=find_packages(include=('paddleaudio*')),
+        packages=find_packages(include=['paddleaudio*']),
         package_data=lib_package_data,
         ext_modules=setup_helpers.get_ext_modules(),
         zip_safe=True,
diff --git a/demos/speech_web/speech_server/requirements.txt b/demos/speech_web/speech_server/requirements.txt
index cdc654656..8425a1fee 100644
--- a/demos/speech_web/speech_server/requirements.txt
+++ b/demos/speech_web/speech_server/requirements.txt
@@ -1,8 +1,6 @@
 aiofiles
 faiss-cpu
-praatio==5.0.0
+praatio>=5.0.0
 pydantic
 python-multipart
-scikit_learn
 starlette
-uvicorn
diff --git a/docs/requirements.txt b/docs/requirements.txt
index c6228d917..65f451cd2 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,12 +1,9 @@
 braceexpand
-colorlog
 editdistance
-fastapi
 g2p_en
 g2pM
 h5py
 inflect
-jieba
 jsonlines
 kaldiio
 keyboard
@@ -16,7 +13,7 @@ matplotlib
 myst-parser
 nara_wpe
 numpydoc
-onnxruntime==1.10.0
+onnxruntime>=1.11.0
 opencc
 paddlenlp
 # use paddlepaddle == 2.3.* according to: https://github.com/PaddlePaddle/Paddle/issues/48243
@@ -24,32 +21,25 @@ paddlepaddle>=2.2.2,<2.4.0
 paddlespeech_ctcdecoders
 paddlespeech_feat
 pandas
-pathos==0.2.8
 pattern_singleton
-Pillow>=9.0.0
 ppdiffusers>=0.9.0
-praatio==5.0.0
+praatio>=5.0.0
 prettytable
 pypinyin-dict
 pypinyin<=0.44.0
 python-dateutil
-pyworld==0.2.12
+pyworld>=0.2.12
 recommonmark>=0.5.0
-resampy==0.2.2
+resampy
 sacrebleu
-scipy
-sentencepiece~=0.1.96
-soundfile~=0.10
 sphinx
 sphinx-autobuild
 sphinx-markdown-tables
 sphinx_rtd_theme
 textgrid
 timer
-tqdm
+ToJyutping
 typeguard
-uvicorn
-visualdl
 webrtcvad
 websockets
 yacs~=0.1.8
diff --git a/examples/aishell/asr3/README.md b/examples/aishell/asr3/README.md
new file mode 100644
index 000000000..e5806d621
--- /dev/null
+++ b/examples/aishell/asr3/README.md
@@ -0,0 +1,198 @@
+# Wav2vec2ASR with Aishell
+This example contains code used to finetune the [wav2vec2.0](https://arxiv.org/pdf/2006.11477.pdf) model with the [Aishell dataset](http://www.openslr.org/resources/33).
+## Overview
+All the scripts you need are in `run.sh`. There are several stages in `run.sh`, and each stage has its function.
+| Stage | Function                                                     |
+|:---- |:----------------------------------------------------------- |
+| 0     | Process data. It includes: <br>       (1) Download the dataset <br>       (2) Calculate the CMVN of the train dataset <br>       (3) Get the vocabulary file <br>       (4) Get the manifest files of the train, development and test dataset<br>       (5) Download the pretrained wav2vec2 model |
+| 1     | Train the model                                              |
+| 2     | Get the final model by averaging the top-k models, set k = 1 means to choose the best model |
+| 3     | Test the final model performance                             |
+| 4     | Infer the single audio file                                  |
+
+
+You can choose to run a range of stages by setting `stage` and `stop_stage`.
+
+For example, if you want to execute the code in stage 2 and stage 3, you can run this script:
+```bash
+bash run.sh --stage 2 --stop_stage 3
+```
+Or you can set `stage` equal to `stop_stage` to only run one stage.
+For example, if you only want to run `stage 0`, you can use the script below:
+```bash
+bash run.sh --stage 0 --stop_stage 0
+```
+The document below will describe the scripts in `run.sh` in detail.
+## The Environment Variables
+The path.sh contains the environment variables. 
+```bash
+. ./path.sh
+. ./cmd.sh
+```
+This script needs to be run first. And another script is also needed:
+```bash
+source ${MAIN_ROOT}/utils/parse_options.sh
+```
+It will support the way of using `--variable value` in the shell scripts.
+## The Local Variables
+Some local variables are set in `run.sh`. 
+`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU. 
+`stage` denotes the number of the stage you want to start from in the experiments.
+`stop_stage` denotes the number of the stage you want to end at in the experiments.
+`conf_path` denotes the config path of the model.
+`avg_num` denotes the number K of top-K models you want to average to get the final model.
+`audio_file` denotes the file path of the single file you want to infer in stage 4.
+`ckpt` denotes the checkpoint prefix of the model, e.g. "wav2vec2ASR"
+
+You can set the local variables (except `ckpt`) when you use `run.sh`
+
+For example, you can set the `gpus` and `avg_num` when you use the command line:
+```bash
+bash run.sh --gpus 0,1 --avg_num 20
+```
+## Stage 0: Data Processing
+To use this example, you need to process the data first, and you can use stage 0 in `run.sh` to do this. The code is shown below:
+```bash
+ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+     # prepare data
+     bash ./local/data.sh || exit -1
+ fi
+```
+Stage 0 is for processing the data.
+
+If you only want to process the data, you can run
+```bash
+bash run.sh --stage 0 --stop_stage 0
+```
+You can also just run these scripts in your command line.
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+```
+After processing the data, the `data` directory will look like this:
+```bash
+data/
+|-- dev.meta
+|-- lang_char
+|   `-- vocab.txt
+|-- manifest.dev
+|-- manifest.dev.raw
+|-- manifest.test
+|-- manifest.test.raw
+|-- manifest.train
+|-- manifest.train.raw
+|-- mean_std.json
+|-- test.meta
+|-- train.meta
+|-- train.csv
+|-- dev.csv
+|-- test.csv
+```
+
+Stage 0 also downloads the Chinese pre-trained [wav2vec2](https://paddlespeech.bj.bcebos.com/wav2vec/chinese-wav2vec2-large.pdparams) model.
+```bash
+mkdir -p exp/wav2vec2
+wget -P exp/wav2vec2 https://paddlespeech.bj.bcebos.com/wav2vec/chinese-wav2vec2-large.pdparams
+```
+## Stage 1: Model Training
+If you want to train the model, you can use stage 1 in `run.sh`. The code is shown below.
+```bash
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+     # train model, all `ckpt` under `exp` dir
+     CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+ fi
+```
+If you want to train the model, you can use the script below to execute stage 0 and stage 1:
+```bash
+bash run.sh --stage 0 --stop_stage 1
+```
+or you can run these scripts in the command line (only use CPU).
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR
+```
+## Stage 2: Top-k Models Averaging
+After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can choose the best model from them based on the validation loss or we can sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below. Note: We only train one epoch for wav2vec2ASR, thus the `avg_num` is set to 1.
+```bash
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+     # avg n best model
+     avg.sh best exp/${ckpt}/checkpoints ${avg_num}
+ fi
+```
+The `avg.sh` script is in `../../../utils/`, which is defined in `path.sh`.
+If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2:
+```bash
+bash run.sh --stage 0 --stop_stage 2
+```
+or you can run these scripts in the command line (only use CPU).
+
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR
+avg.sh best exp/wav2vec2ASR/checkpoints 1
+```
+## Stage 3: Model Testing
+The test stage is to evaluate the model performance. The code of test stage is shown below:
+```bash
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+     # test ckpt avg_n
+     CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+ fi
+```
+If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3 :
+```bash
+bash run.sh --stage 0 --stop_stage 3
+```
+or you can run these scripts in the command line (only use CPU).
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR
+avg.sh best exp/wav2vec2ASR/checkpoints 1
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1
+```
+## Pretrained Model
+You can get the pretrained wav2vec2ASR from [this](../../../docs/source/released_model.md).
+
+Use the `tar` command to unpack the model, and then you can use the script to test the model.
+
+For example:
+```bash
+wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz
+tar xzvf wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz
+source path.sh
+# If you have processed the data and got the manifest files, you can skip the following 2 steps
+bash local/data.sh --stage -1 --stop_stage -1
+bash local/data.sh --stage 2 --stop_stage 2
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1
+```
+The performance of the released models is shown [here](./RESULTS.md).
+
+
+## Stage 4: Single Audio File Inference
+In some situations, you want to use the trained model to do the inference for a single audio file. You can use stage 4. The code is shown below:
+```bash
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+     # test a single .wav file
+     CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+ fi
+```
+You can train the model by yourself using `bash run.sh --stage 0 --stop_stage 3`, or you can download the pretrained model through the script below:
+```bash
+wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz
+tar xzvf wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz
+```
+You can download the audio demo:
+```bash
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
+```
+You need to prepare an audio file or use the audio demo above, please confirm the sample rate of the audio is 16K. You can get the result of the audio demo by running the script below.
+```bash
+CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_002_en.wav
+```
diff --git a/examples/aishell/asr3/cmd.sh b/examples/aishell/asr3/cmd.sh
new file mode 100755
index 000000000..7b70ef5e0
--- /dev/null
+++ b/examples/aishell/asr3/cmd.sh
@@ -0,0 +1,89 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+#   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#   --time <time>: Limit the maximum time to execute.
+#   --mem <mem>: Limit the maximum memory usage.
+#   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
+#   --num-threads <nthreads>: Specify the number of CPU cores.
+#   --gpu <ngpu>: Specify the number of GPU devices.
+#   --config: Change the configuration file from default.
+#
+# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
+# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
+# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
+# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
+#
+# run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
+# These options are mapping to specific options for each backend and
+# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
+# If jobs failed, your configuration might be wrong for your environment.
+#
+#
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
+#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
+# =========================================================~
+
+
+# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
+cmd_backend='local'
+
+# Local machine, without any Job scheduling system
+if [ "${cmd_backend}" = local ]; then
+
+    # The other usage
+    export train_cmd="run.pl"
+    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+    export cuda_cmd="run.pl"
+    # Used for "*_recog.py"
+    export decode_cmd="run.pl"
+
+# "qsub" (SGE, Torque, PBS, etc.)
+elif [ "${cmd_backend}" = sge ]; then
+    # The default setting is written in conf/queue.conf.
+    # You must change "-q g.q" for the "queue" for your environment.
+    # To know the "queue" names, type "qhost -q"
+    # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
+
+    export train_cmd="queue.pl"
+    export cuda_cmd="queue.pl"
+    export decode_cmd="queue.pl"
+
+# "sbatch" (Slurm)
+elif [ "${cmd_backend}" = slurm ]; then
+    # The default setting is written in conf/slurm.conf.
+    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+    # To know the "partition" names, type "sinfo".
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
+    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
+
+    export train_cmd="slurm.pl"
+    export cuda_cmd="slurm.pl"
+    export decode_cmd="slurm.pl"
+
+elif [ "${cmd_backend}" = ssh ]; then
+    # You have to create ".queue/machines" to specify the host to execute jobs.
+    # e.g. .queue/machines
+    #   host1
+    #   host2
+    #   host3
+    # Assuming you can log in to them without any password, i.e. you have to set up ssh keys.
+
+    export train_cmd="ssh.pl"
+    export cuda_cmd="ssh.pl"
+    export decode_cmd="ssh.pl"
+
+# This is an example of specifying several unique options in the JHU CLSP cluster setup.
+# Users can modify/add their own command options according to their cluster environments.
+elif [ "${cmd_backend}" = jhu ]; then
+
+    export train_cmd="queue.pl --mem 2G"
+    export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
+    export decode_cmd="queue.pl --mem 4G"
+
+else
+    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
+    return 1
+fi
diff --git a/examples/aishell/asr3/conf/preprocess.yaml b/examples/aishell/asr3/conf/preprocess.yaml
new file mode 100755
index 000000000..724782ed6
--- /dev/null
+++ b/examples/aishell/asr3/conf/preprocess.yaml
@@ -0,0 +1,3 @@
+process:
+    # use raw audio
+  - type: wav_process
diff --git a/examples/aishell/asr3/conf/train_with_wav2vec.yaml b/examples/aishell/asr3/conf/train_with_wav2vec.yaml
new file mode 100755
index 000000000..77b3762ef
--- /dev/null
+++ b/examples/aishell/asr3/conf/train_with_wav2vec.yaml
@@ -0,0 +1,101 @@
+# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# Modified from speechbrain 2023 (https://github.com/speechbrain/speechbrain/blob/develop/recipes/AISHELL-1/ASR/CTC/hparams/train_with_wav2vec.yaml)
+
+# ############################################################################
+# Model: CTC-wav2vec2
+# Encoder: wav2vec2
+# Decoder: -
+# Tokens: Char
+# losses: CTC
+# Training: AISHELL-1
+# Authors:  Yingzhi WANG 2022
+# ############################################################################
+
+output_folder: !ref data
+cer_file: !ref <output_folder>/cer.txt
+save_folder: !ref <output_folder>/save
+train_log: !ref <output_folder>/train_log.txt
+
+# Data files
+data_folder: data/aishell # e.g. /path/to/aishell
+
+skip_prep: False
+ckpt_interval_minutes: 15 # save checkpoint every N min
+train_data: !ref <output_folder>/train.csv
+valid_data: !ref <output_folder>/dev.csv
+test_data: !ref <output_folder>/test.csv
+
+wav2vec2_hub: TencentGameMate/chinese-wav2vec2-large
+
+# Training parameters
+number_of_epochs: 80
+lr: 1.0
+lr_wav2vec: 0.0001
+sorting: ascending
+auto_mix_prec: False
+sample_rate: 16000
+
+# With data_parallel batch_size is split into N jobs
+# With DDP batch_size is multiplied by N jobs
+# Must be 8 per GPU to fit 32GB of VRAM
+batch_size: 5
+test_batch_size: 1 # need set to 1 when decoding
+
+dynamic_batching: False
+dynamic_batch_sampler:
+   feats_hop_size: 0.01
+   max_batch_len: 15 # in terms of "duration" in annotations by default, second here
+   left_bucket_len: 200 # old implementation attributes
+   multiplier: 1.1 # old implementation attributes
+   shuffle_ex: False # if true re-creates batches at each epoch shuffling examples.
+   num_buckets: 10 # floor(log(max_batch_len/left_bucket_len, multiplier)) + 1
+   batch_ordering: ascending
+
+num_workers: 6
+
+# Dataloader options
+train_dataloader_opts:
+   batch_size: !ref <batch_size>
+   num_workers: !ref <num_workers>
+valid_dataloader_opts:
+   batch_size: !ref <test_batch_size>
+   num_workers: !ref <num_workers>
+test_dataloader_opts:
+   batch_size: !ref <test_batch_size>
+   num_workers: !ref <num_workers>
+
+wav2vec_output_dim: 1024
+dnn_neurons: 1024
+freeze_wav2vec: False
+dropout: 0.15
+
+tokenizer: !apply:transformers.BertTokenizer.from_pretrained
+   pretrained_model_name_or_path: bert-base-chinese
+# bert-base-chinese tokens length
+output_neurons: 21128
+
+# Decoding parameters
+# Be sure that the bos and eos index match with the BPEs ones
+blank_index: 0
+
+# AISHELL-1 has spaces between words in the transcripts,
+# which Chinese writing normally does not do.
+# If remove_spaces, spaces are removed
+# from the transcript before computing CER.
+# (e.g., 祝 可爱 的 你 —> 祝可爱的你)
+remove_spaces: True
+split_tokens: !apply:operator.not_ [!ref <remove_spaces>]
diff --git a/examples/aishell/asr3/conf/tuning/decode.yaml b/examples/aishell/asr3/conf/tuning/decode.yaml
new file mode 100755
index 000000000..69d0a4551
--- /dev/null
+++ b/examples/aishell/asr3/conf/tuning/decode.yaml
@@ -0,0 +1,4 @@
+decode_batch_size: 1  # local/test.sh and local/test_wav.sh also pass decode.decode_batch_size 1 via --opts
+error_rate_type: cer
+decoding_method: ctc_greedy_search  # one of: 'ctc_greedy_search', 'ctc_prefix_beam_search'
+beam_size: 10  # presumably only used by 'ctc_prefix_beam_search' -- TODO confirm
diff --git a/examples/aishell/asr3/conf/wav2vec2ASR.yaml b/examples/aishell/asr3/conf/wav2vec2ASR.yaml
new file mode 100755
index 000000000..cdb04f8c1
--- /dev/null
+++ b/examples/aishell/asr3/conf/wav2vec2ASR.yaml
@@ -0,0 +1,167 @@
+############################################
+#          Network Architecture           #
+############################################
+freeze_wav2vec2: False
+normalize_wav: True
+output_norm: True
+init_type: 'kaiming_uniform' # Warning: needed for convergence
+enc:
+  input_shape: 1024  # presumably the wav2vec2 hidden_size (1024 below) -- TODO confirm
+  dnn_blocks: 3
+  dnn_neurons: 1024
+  activation: True
+  normalization: True
+  dropout_rate: [0.15, 0.15, 0.0]  # three values for dnn_blocks: 3; presumably one per block -- TODO confirm
+ctc:
+  enc_n_units: 1024
+  blank_id: 0
+  dropout_rate: 0.0
+
+audio_augment:
+  speeds: [90, 100, 110]
+
+spec_augment:
+  time_warp: True
+  time_warp_window: 5
+  time_warp_mode: bicubic
+  freq_mask: True
+  n_freq_mask: 2
+  time_mask: True
+  n_time_mask: 2
+  replace_with_zero: False
+  freq_mask_width: 30
+  time_mask_width: 40
+wav2vec2_params_path: exp/wav2vec2/chinese-wav2vec2-large.pdparams
+
+
+############################################
+#               Wav2Vec2.0                 #
+############################################
+# vocab_size: 1000000
+hidden_size: 1024
+num_hidden_layers: 24
+num_attention_heads: 16
+intermediate_size: 4096
+hidden_act: gelu
+hidden_dropout: 0.1
+activation_dropout: 0.0
+attention_dropout: 0.1
+feat_proj_dropout: 0.1
+feat_quantizer_dropout: 0.0
+final_dropout: 0.0
+layerdrop: 0.1
+initializer_range: 0.02
+layer_norm_eps: 1e-5
+feat_extract_norm: layer
+feat_extract_activation: gelu
+conv_dim: [512, 512, 512, 512, 512, 512, 512]
+conv_stride: [5, 2, 2, 2, 2, 2, 2]
+conv_kernel: [10, 3, 3, 3, 3, 2, 2]
+conv_bias: True
+num_conv_pos_embeddings: 128
+num_conv_pos_embedding_groups: 16
+do_stable_layer_norm: True
+apply_spec_augment: False
+mask_channel_length: 10
+mask_channel_min_space: 1
+mask_channel_other: 0.0
+mask_channel_prob: 0.0
+mask_channel_selection: static
+mask_feature_length: 10
+mask_feature_min_masks: 0
+mask_feature_prob: 0.0
+mask_time_length: 10
+mask_time_min_masks: 2
+mask_time_min_space: 1
+mask_time_other: 0.0
+mask_time_prob: 0.075
+mask_time_selection: static
+num_codevectors_per_group: 320
+num_codevector_groups: 2
+contrastive_logits_temperature: 0.1
+num_negatives: 100
+codevector_dim: 256
+proj_codevector_dim: 256
+diversity_loss_weight: 0.1
+use_weighted_layer_sum: False
+# pad_token_id: 0
+# bos_token_id: 1
+# eos_token_id: 2
+add_adapter: False
+adapter_kernel_size: 3
+adapter_stride: 2
+num_adapter_layers: 3
+output_hidden_size: None  # NOTE(review): YAML loads a bare None as the string "None", not null -- confirm the consumer expects that
+
+###########################################
+#                   Data                  #
+###########################################
+
+train_manifest: data/manifest.train
+dev_manifest: data/manifest.dev
+test_manifest: data/manifest.test
+vocab_filepath: data/lang_char/vocab.txt 
+
+###########################################
+#              Dataloader                 #
+###########################################
+
+unit_type: 'char'
+mean_std_filepath:  # empty value, which YAML loads as null
+preprocess_config: conf/preprocess.yaml
+sortagrad: -1 # Feed samples from shortest to longest; -1: enabled for all epochs, 0: disabled, any other value N: enabled for N epochs
+batch_size: 5  # Different batch_size may cause large differences in results
+maxlen_in: 51200000000  # if input length > maxlen_in, the batch size is automatically reduced
+maxlen_out: 1500000  # if output length > maxlen_out, the batch size is automatically reduced
+minibatches: 0 # for debug
+batch_count: auto
+batch_bins: 0 
+batch_frames_in: 0
+batch_frames_out: 0
+batch_frames_inout: 0
+num_workers: 6
+subsampling_factor: 1
+num_encs: 1
+dist_sampler: True
+shortest_first: True
+return_lens_rate: True
+
+###########################################
+#        use speechbrain dataloader       #
+###########################################
+use_sb_pipeline: True  # whether use speechbrain pipeline. Default is True.
+sb_pipeline_conf: conf/train_with_wav2vec.yaml
+
+###########################################
+#                 Training                #
+###########################################
+n_epoch: 80
+accum_grad: 1
+global_grad_clip: 5.0
+
+model_optim: adadelta
+model_optim_conf:
+  lr: 1.0
+  weight_decay: 0.0
+  rho: 0.95
+  epsilon: 1.0e-8
+
+wav2vec2_optim: adam
+wav2vec2_optim_conf:
+  lr: 0.0001
+  weight_decay: 0.0
+
+model_scheduler: newbobscheduler
+model_scheduler_conf:
+  improvement_threshold: 0.0025
+  annealing_factor: 0.8
+  patient: 0
+wav2vec2_scheduler: newbobscheduler
+wav2vec2_scheduler_conf:
+  improvement_threshold: 0.0025
+  annealing_factor: 0.9
+  patient: 0
+log_interval: 1
+checkpoint:
+  kbest_n: 50
+  latest_n: 5
diff --git a/examples/aishell/asr3/local/aishell_prepare.py b/examples/aishell/asr3/local/aishell_prepare.py
new file mode 100644
index 000000000..a25735791
--- /dev/null
+++ b/examples/aishell/asr3/local/aishell_prepare.py
@@ -0,0 +1,129 @@
+# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Modified from speechbrain 2023
+# (https://github.com/speechbrain/speechbrain/blob/develop/recipes/AISHELL-1/aishell_prepare.py)
+import argparse
+import csv
+import glob
+import logging
+import os
+
+from paddlespeech.s2t.models.wav2vec2.io.dataio import read_audio
+
+logger = logging.getLogger(__name__)
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+
+# Command-line options. They are parsed when the module is loaded and the
+# result is kept in the module-level `args` read by main().
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--data_folder", default=DATA_HOME + "/Aishell", type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--save_folder", default="data/", type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+parser.add_argument(
+    "--skip_prep",
+    default=False,
+    # type=bool would turn every non-empty string, including "False", into
+    # True, so the text is compared against the accepted truthy spellings.
+    type=lambda text: text.strip().lower() in ("1", "true", "t", "yes", "y"),
+    help="If True, skip data preparation. (default: %(default)s)")
+args = parser.parse_args()
+
+
+def prepare_aishell(data_folder, save_folder, skip_prep=False):
+    """Write train/dev/test csv manifests for the AISHELL-1 dataset.
+
+    For each split, `<save_folder>/<split>.csv` gets the header
+    `ID,duration,wav,transcript` followed by one row per wav file found under
+    `data_aishell/wav/<split>/*/` that has an entry in the transcript file
+    `data_aishell/transcript/aishell_transcript_v0.8.txt`; wavs without a
+    transcript are skipped. A split whose csv already exists is left
+    untouched.
+
+    Args:
+        data_folder: root of the dataset; must contain `data_aishell/`.
+        save_folder: directory that receives the csv files.
+        skip_prep: if True, return immediately without doing anything.
+    """
+    if skip_prep:
+        return
+
+    # Map utterance id (first token of a transcript line) to its text.
+    # The encoding is explicit because the default depends on the locale and
+    # the transcripts are Chinese text.
+    transcript_path = os.path.join(
+        data_folder, "data_aishell/transcript/aishell_transcript_v0.8.txt")
+    filename2transcript = {}
+    with open(transcript_path, "r", encoding="utf-8") as f:
+        for line in f:
+            fields = line.split()
+            if not fields:  # tolerate blank lines
+                continue
+            filename2transcript[fields[0]] = " ".join(fields[1:])
+
+    ID_start = 0  # offset that keeps IDs unique across the splits
+    for split in ["train", "dev", "test"]:
+        new_filename = os.path.join(save_folder, split) + ".csv"
+        if os.path.exists(new_filename):
+            continue
+        logger.info("Preparing %s...", new_filename)
+
+        # Sorted so that row order and IDs do not depend on directory order.
+        all_wavs = sorted(
+            glob.glob(
+                os.path.join(data_folder, "data_aishell/wav", split, "*",
+                             "*.wav")))
+        csv_output = [["ID", "duration", "wav", "transcript"]]
+        for i, wav_path in enumerate(all_wavs):
+            filename = os.path.splitext(os.path.basename(wav_path))[0]
+            if filename not in filename2transcript:
+                continue
+            signal = read_audio(wav_path)
+            duration = signal.shape[0] / 16000  # assumes a 16 kHz sample rate
+            csv_output.append([
+                ID_start + i,
+                str(duration),
+                wav_path,
+                filename2transcript[filename],
+            ])
+
+        # newline="" is what the csv module requires of files it writes to.
+        with open(new_filename, mode="w", newline="",
+                  encoding="utf-8") as csv_f:
+            csv_writer = csv.writer(
+                csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
+            csv_writer.writerows(csv_output)
+
+        logger.info("\t%s successfully created!", new_filename)
+
+        # Advance by every wav seen in this split, including skipped ones.
+        ID_start += len(all_wavs)
+
+
+def main():
+    """Build the csv manifests from the parsed options, honouring --skip_prep."""
+    if args.data_folder.startswith('~'):
+        args.data_folder = os.path.expanduser(args.data_folder)
+    prepare_aishell(
+        args.data_folder, args.save_folder, skip_prep=args.skip_prep)
+    print("Data csv prepare done!")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/aishell/asr3/local/data.sh b/examples/aishell/asr3/local/data.sh
new file mode 100755
index 000000000..1a468f546
--- /dev/null
+++ b/examples/aishell/asr3/local/data.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+stage=-1
+stop_stage=-1
+dict_dir=data/lang_char
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
+
+mkdir -p data
+mkdir -p ${dict_dir}
+TARGET_DIR=${MAIN_ROOT}/dataset
+mkdir -p ${TARGET_DIR}
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    # Download the data and generate the manifests, then generate the csv
+    # files for the speechbrain dataloader. The trailing `&&` chains the two
+    # steps so that $? below is non-zero if either of them fails; checking $?
+    # after the second command alone would hide a failed download.
+    python3 ${TARGET_DIR}/aishell/aishell.py \
+    --manifest_prefix="data/manifest" \
+    --target_dir="${TARGET_DIR}/aishell" &&
+    python3 local/aishell_prepare.py \
+    --data_folder="${TARGET_DIR}/aishell" \
+    --save_folder="data/"
+
+    if [ $? -ne 0 ]; then
+        echo "Prepare Aishell failed. Terminated."
+        exit 1
+    fi
+
+    for dataset in train dev test; do
+        mv data/manifest.${dataset} data/manifest.${dataset}.raw
+    done
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # compute mean and stddev for normalizer
+    num_workers=$(nproc)
+    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
+    --manifest_path="data/manifest.train.raw" \
+    --spectrum_type="fbank" \
+    --feat_dim=80 \
+    --delta_delta=false \
+    --stride_ms=10 \
+    --window_ms=25 \
+    --sample_rate=16000 \
+    --use_dB_normalization=False \
+    --num_samples=-1 \
+    --num_workers=${num_workers} \
+    --output_path="data/mean_std.json"
+
+    if [ $? -ne 0 ]; then
+        echo "Compute mean and stddev failed. Terminated."
+        exit 1
+    fi
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # build the character-level vocabulary (--unit_type="char") from the raw
+    # training manifest and write it to ${dict_dir}/vocab.txt
+    python3 ${MAIN_ROOT}/utils/build_vocab.py \
+    --unit_type="char" \
+    --count_threshold=0 \
+    --vocab_path="${dict_dir}/vocab.txt" \
+    --manifest_paths "data/manifest.train.raw"
+
+    if [ $? -ne 0 ]; then
+        echo "Build vocabulary failed. Terminated."
+        exit 1
+    fi
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # format manifest with tokenids, vocab size; the three splits are
+    # formatted in parallel, one background job per split
+    pids=()
+    for dataset in train dev test; do
+        python3 ${MAIN_ROOT}/utils/format_data.py \
+            --cmvn_path "data/mean_std.json" \
+            --unit_type "char" \
+            --vocab_path="${dict_dir}/vocab.txt" \
+            --manifest_path="data/manifest.${dataset}.raw" \
+            --output_path="data/manifest.${dataset}" &
+        pids+=($!)
+    done
+    # A bare `wait` always returns 0 and an `exit` inside a background job
+    # only ends that job, so wait on each pid to see its real exit status.
+    for pid in "${pids[@]}"; do
+        wait ${pid} || { echo "Format manifest failed. Terminated."; exit 1; }
+    done
+fi
+echo "Aishell data preparation done."
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    mkdir -p exp/wav2vec2
+    echo "Pretrained wav2vec2 model download"
+    wget -P exp/wav2vec2 https://paddlespeech.bj.bcebos.com/wav2vec/chinese-wav2vec2-large.pdparams
+fi
+
+exit 0
+
diff --git a/examples/aishell/asr3/local/test.sh b/examples/aishell/asr3/local/test.sh
new file mode 100755
index 000000000..9d4b84291
--- /dev/null
+++ b/examples/aishell/asr3/local/test.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+
+set -e
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+expdir=exp
+datadir=data
+
+train_set=train_960
+recog_set="test-clean test-other dev-clean dev-other"
+recog_set="test-clean"
+
+config_path=$1
+decode_config_path=$2
+ckpt_prefix=$3
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+# download language model
+#bash local/download_lm_en.sh
+#if [ $? -ne 0 ]; then
+#    exit 1
+#fi
+
+python3 utils/format_rsl.py \
+    --origin_ref data/manifest.test.raw \
+    --trans_ref data/manifest.test.text
+
+
+for type in ctc_greedy_search; do
+    echo "decoding ${type}"
+    batch_size=1
+    python3 -u ${BIN_DIR}/test.py \
+        --ngpu ${ngpu} \
+        --config ${config_path} \
+        --decode_cfg ${decode_config_path} \
+        --result_file ${ckpt_prefix}.${type}.rsl \
+        --checkpoint_path ${ckpt_prefix} \
+        --opts decode.decoding_method ${type} \
+        --opts decode.decode_batch_size ${batch_size}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+    python3 utils/format_rsl.py \
+        --origin_hyp ${ckpt_prefix}.${type}.rsl \
+        --trans_hyp ${ckpt_prefix}.${type}.rsl.text
+
+    python3 utils/compute-wer.py --char=1 --v=1 \
+        data/manifest.test.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
+    echo "decoding ${type} done."
+done
+
+for type in ctc_prefix_beam_search; do  # decode the test set, then score the hypotheses
+    echo "decoding ${type}"
+    batch_size=1
+    python3 -u ${BIN_DIR}/test.py \
+        --ngpu ${ngpu} \
+        --config ${config_path} \
+        --decode_cfg ${decode_config_path} \
+        --result_file ${ckpt_prefix}.${type}.rsl \
+        --checkpoint_path ${ckpt_prefix} \
+        --opts decode.decoding_method ${type} \
+        --opts decode.decode_batch_size ${batch_size}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+    python3 utils/format_rsl.py \
+        --origin_hyp ${ckpt_prefix}.${type}.rsl \
+        --trans_hyp ${ckpt_prefix}.${type}.rsl.text
+    # Score against data/manifest.test.text, the reference written by the format_rsl.py call at the top of this script.
+    python3 utils/compute-wer.py --char=1 --v=1 \
+        data/manifest.test.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
+    echo "decoding ${type} done."
+done
+
+echo "Finished"
+
+exit 0
diff --git a/examples/aishell/asr3/local/test_wav.sh b/examples/aishell/asr3/local/test_wav.sh
new file mode 100755
index 000000000..fdf3589f4
--- /dev/null
+++ b/examples/aishell/asr3/local/test_wav.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+if [ $# != 4 ];then
+    echo "usage: ${0} config_path decode_config_path ckpt_path_prefix audio_file"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+decode_config_path=$2
+ckpt_prefix=$3
+audio_file=$4
+
+mkdir -p data
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/en/demo_002_en.wav -P data/
+if [ $? -ne 0 ]; then
+   exit 1
+fi
+
+if [ ! -f "${audio_file}" ]; then  # quoted so empty values and paths containing spaces are tested correctly
+    echo "Please input the right audio_file path"
+    exit 1
+fi
+
+chunk_mode=false
+if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
+    chunk_mode=true
+fi
+
+# download language model
+#bash local/download_lm_ch.sh
+#if [ $? -ne 0 ]; then
+#    exit 1
+#fi
+
+for type in ctc_greedy_search; do
+    echo "decoding ${type}"
+    batch_size=1
+    output_dir=${ckpt_prefix}
+    mkdir -p ${output_dir}
+    python3 -u ${BIN_DIR}/test_wav.py \
+    --ngpu ${ngpu} \
+    --config ${config_path} \
+    --decode_cfg ${decode_config_path} \
+    --result_file ${output_dir}/${type}.rsl \
+    --checkpoint_path ${ckpt_prefix} \
+    --opts decode.decoding_method ${type} \
+    --opts decode.decode_batch_size ${batch_size} \
+    --audio_file ${audio_file}
+
+    if [ $? -ne 0 ]; then
+        echo "Failed in evaluation!"
+        exit 1
+    fi
+done
+exit 0
diff --git a/examples/aishell/asr3/local/train.sh b/examples/aishell/asr3/local/train.sh
new file mode 100755
index 000000000..e51e3d34c
--- /dev/null
+++ b/examples/aishell/asr3/local/train.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+if [ $# -lt 2 ] || [ $# -gt 4 ];then  # config_path and ckpt_name are required; resume ($3) and ips ($4) are optional
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name resume(optional) ips(optional)"
+    exit -1
+fi
+
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+config_path=$1
+ckpt_name=$2
+resume=$3
+ips=$4
+
+if [ ! $ips ];then
+  ips_config=
+else
+  ips_config="--ips="${ips}
+fi
+
+mkdir -p exp
+
+# seed may break model convergence
+seed=2
+if [ ${seed} != 0 ]; then
+    export FLAGS_cudnn_deterministic=True
+fi
+
+# export FLAGS_cudnn_exhaustive_search=true
+# export FLAGS_conv_workspace_size_limit=4000
+# export FLAGS_allocator_strategy=naive_best_fit
+
+if [ ${ngpu} == 0 ]; then
+python3 -u ${BIN_DIR}/train.py \
+--ngpu ${ngpu} \
+--config ${config_path} \
+--output exp/${ckpt_name} \
+--seed ${seed} \
+--resume ${resume}
+else
+python3 -m paddle.distributed.launch --log_dir=${ckpt_name} --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
+--ngpu ${ngpu} \
+--config ${config_path} \
+--output exp/${ckpt_name} \
+--seed ${seed} \
+--resume ${resume}
+fi
+train_status=$?  # status of whichever launcher ran; saved here because the `unset` below resets $? to 0
+
+if [ ${seed} != 0 ]; then
+    unset FLAGS_cudnn_deterministic
+fi
+if [ ${train_status} -ne 0 ]; then
+    echo "Failed in training!"
+    exit 1
+fi
+
+exit 0
diff --git a/examples/aishell/asr3/path.sh b/examples/aishell/asr3/path.sh
new file mode 100755
index 000000000..f47178382
--- /dev/null
+++ b/examples/aishell/asr3/path.sh
@@ -0,0 +1,15 @@
+# Repository root: three directory levels above the current directory.
+export MAIN_ROOT=$(realpath ${PWD}/../../../)
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
+export LC_ALL=C
+
+# Python: write no bytecode files, force UTF-8 I/O (avoids UnicodeDecodeError
+# when LC_ALL=C) and put the repository root on the import path.
+export PYTHONDONTWRITEBYTECODE=1 PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
+
+# BIN_DIR is the directory the local/*.sh scripts run train.py and test.py from.
+MODEL=wav2vec2
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/s2t/exps/${MODEL}/bin
diff --git a/examples/aishell/asr3/run.sh b/examples/aishell/asr3/run.sh
new file mode 100755
index 000000000..9b0a3c472
--- /dev/null
+++ b/examples/aishell/asr3/run.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+set -e
+
+. ./path.sh || exit 1;
+. ./cmd.sh || exit 1;
+
+gpus=0,1,2,3
+stage=0
+stop_stage=4
+conf_path=conf/wav2vec2ASR.yaml
+ips=            #xx.xx.xx.xx,xx.xx.xx.xx
+decode_conf_path=conf/tuning/decode.yaml
+avg_num=1
+resume=         # xx e.g. 30
+export FLAGS_cudnn_deterministic=1
+. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+audio_file=data/demo_002_en.wav
+
+avg_ckpt=avg_${avg_num}  # checkpoint name handed to the test stages below
+ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')  # config file name up to its first dot
+echo "checkpoint name ${ckpt}"
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    bash ./local/data.sh || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `exp` dir; resume and ips are quoted so an empty resume still occupies $3 and ips stays in $4
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${resume}" "${ips}"
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # avg n best model
+    avg.sh last exp/${ckpt}/checkpoints ${avg_num}
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # greedy search decoder
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    # test a single .wav file
+    CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+fi
diff --git a/examples/aishell/asr3/utils b/examples/aishell/asr3/utils
new file mode 120000
index 000000000..973afe674
--- /dev/null
+++ b/examples/aishell/asr3/utils
@@ -0,0 +1 @@
+../../../utils
\ No newline at end of file
diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh
deleted file mode 100755
index fb7e8411c..000000000
--- a/examples/aishell3/tts3/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=fastspeech2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh
new file mode 120000
index 000000000..4785b9095
--- /dev/null
+++ b/examples/aishell3/tts3/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts3/path.sh
\ No newline at end of file
diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh
deleted file mode 100755
index a37cd21e3..000000000
--- a/examples/aishell3/vc0/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=tacotron2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh
new file mode 120000
index 000000000..9e1fdbd16
--- /dev/null
+++ b/examples/aishell3/vc0/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts0/path.sh
\ No newline at end of file
diff --git a/examples/aishell3/vc1/local/train.sh b/examples/aishell3/vc1/local/train.sh
deleted file mode 100755
index c775fcadc..000000000
--- a/examples/aishell3/vc1/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-python3 ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=2 \
-    --phones-dict=dump/phone_id_map.txt \
-    --voice-cloning=True
\ No newline at end of file
diff --git a/examples/aishell3/vc1/local/train.sh b/examples/aishell3/vc1/local/train.sh
new file mode 120000
index 000000000..115a0b8dc
--- /dev/null
+++ b/examples/aishell3/vc1/local/train.sh
@@ -0,0 +1 @@
+../../vc0/local/train.sh
\ No newline at end of file
diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh
deleted file mode 100755
index fb7e8411c..000000000
--- a/examples/aishell3/vc1/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=fastspeech2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh
new file mode 120000
index 000000000..4785b9095
--- /dev/null
+++ b/examples/aishell3/vc1/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts3/path.sh
\ No newline at end of file
diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh
deleted file mode 100755
index 8fd8977d3..000000000
--- a/examples/aishell3/vc2/local/synthesize.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-
-python3 ${BIN_DIR}/../synthesize.py \
-    --am=fastspeech2_aishell3 \
-    --am_config=${config_path} \
-    --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
-    --am_stat=dump/train/speech_stats.npy \
-    --voc=pwgan_aishell3 \
-    --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
-    --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
-    --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
-    --test_metadata=dump/test/norm/metadata.jsonl \
-    --output_dir=${train_output_path}/test \
-    --phones_dict=dump/phone_id_map.txt \
-    --speaker_dict=dump/speaker_id_map.txt \
-    --voice-cloning=True
diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh
new file mode 120000
index 000000000..ca8df6b04
--- /dev/null
+++ b/examples/aishell3/vc2/local/synthesize.sh
@@ -0,0 +1 @@
+../../vc1/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/aishell3/vc2/local/train.sh b/examples/aishell3/vc2/local/train.sh
deleted file mode 100755
index c775fcadc..000000000
--- a/examples/aishell3/vc2/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-python3 ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=2 \
-    --phones-dict=dump/phone_id_map.txt \
-    --voice-cloning=True
\ No newline at end of file
diff --git a/examples/aishell3/vc2/local/train.sh b/examples/aishell3/vc2/local/train.sh
new file mode 120000
index 000000000..115a0b8dc
--- /dev/null
+++ b/examples/aishell3/vc2/local/train.sh
@@ -0,0 +1 @@
+../../vc0/local/train.sh
\ No newline at end of file
diff --git a/examples/aishell3/vc2/path.sh b/examples/aishell3/vc2/path.sh
deleted file mode 100755
index fb7e8411c..000000000
--- a/examples/aishell3/vc2/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=fastspeech2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/aishell3/vc2/path.sh b/examples/aishell3/vc2/path.sh
new file mode 120000
index 000000000..4785b9095
--- /dev/null
+++ b/examples/aishell3/vc2/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts3/path.sh
\ No newline at end of file
diff --git a/examples/aishell3/voc1/local/preprocess.sh b/examples/aishell3/voc1/local/preprocess.sh
index 44cc3dbe4..71eab68ad 100755
--- a/examples/aishell3/voc1/local/preprocess.sh
+++ b/examples/aishell3/voc1/local/preprocess.sh
@@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump/train/raw/metadata.jsonl \
         --dumpdir=dump/train/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
+        --skip-wav-copy
+
     python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump/dev/raw/metadata.jsonl \
         --dumpdir=dump/dev/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
+        --skip-wav-copy
     
     python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump/test/raw/metadata.jsonl \
         --dumpdir=dump/test/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
+        --skip-wav-copy
 fi
diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh
deleted file mode 100755
index 145557b3d..000000000
--- a/examples/aishell3/voc1/local/synthesize.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
-    --config=${config_path} \
-    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
-    --test-metadata=dump/test/norm/metadata.jsonl \
-    --output-dir=${train_output_path}/test \
-    --generator-type=pwgan
diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh
new file mode 120000
index 000000000..d6aecd8d1
--- /dev/null
+++ b/examples/aishell3/voc1/local/synthesize.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/aishell3/voc1/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=1
diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh
new file mode 120000
index 000000000..2942893d2
--- /dev/null
+++ b/examples/aishell3/voc1/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh
deleted file mode 100755
index 1e6647b86..000000000
--- a/examples/aishell3/voc1/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=parallelwave_gan
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
\ No newline at end of file
diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh
new file mode 120000
index 000000000..b7ed4fb8f
--- /dev/null
+++ b/examples/aishell3/voc1/path.sh
@@ -0,0 +1 @@
+../../csmsc/voc1/path.sh
\ No newline at end of file
diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh
deleted file mode 100755
index 44cc3dbe4..000000000
--- a/examples/aishell3/voc5/local/preprocess.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-
-stage=0
-stop_stage=100
-
-config_path=$1
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # get durations from MFA's result
-    echo "Generate durations.txt from MFA results ..."
-    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
-        --inputdir=./aishell3_alignment_tone \
-        --output=durations.txt \
-        --config=${config_path}
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # extract features
-    echo "Extract features ..."
-    python3 ${BIN_DIR}/../preprocess.py \
-        --rootdir=~/datasets/data_aishell3/ \
-        --dataset=aishell3 \
-        --dumpdir=dump \
-        --dur-file=durations.txt \
-        --config=${config_path} \
-        --cut-sil=True \
-        --num-cpu=20
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # get features' stats(mean and std)
-    echo "Get features' stats ..."
-    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
-        --metadata=dump/train/raw/metadata.jsonl \
-        --field-name="feats"
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-    # normalize, dev and test should use train's stats
-    echo "Normalize ..."
-   
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/train/raw/metadata.jsonl \
-        --dumpdir=dump/train/norm \
-        --stats=dump/train/feats_stats.npy
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/dev/raw/metadata.jsonl \
-        --dumpdir=dump/dev/norm \
-        --stats=dump/train/feats_stats.npy
-    
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/test/raw/metadata.jsonl \
-        --dumpdir=dump/test/norm \
-        --stats=dump/train/feats_stats.npy
-fi
diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh
new file mode 120000
index 000000000..f0cb24de9
--- /dev/null
+++ b/examples/aishell3/voc5/local/preprocess.sh
@@ -0,0 +1 @@
+../../voc1/local/preprocess.sh
\ No newline at end of file
diff --git a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh
deleted file mode 100755
index 647896175..000000000
--- a/examples/aishell3/voc5/local/synthesize.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
-    --config=${config_path} \
-    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
-    --test-metadata=dump/test/norm/metadata.jsonl \
-    --output-dir=${train_output_path}/test \
-    --generator-type=hifigan
diff --git a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh
new file mode 120000
index 000000000..c887112c0
--- /dev/null
+++ b/examples/aishell3/voc5/local/synthesize.sh
@@ -0,0 +1 @@
+../../../csmsc/voc5/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/aishell3/voc5/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=1
diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh
new file mode 120000
index 000000000..2942893d2
--- /dev/null
+++ b/examples/aishell3/voc5/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh
deleted file mode 100755
index 7451b3218..000000000
--- a/examples/aishell3/voc5/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=hifigan
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh
new file mode 120000
index 000000000..b67fe2b39
--- /dev/null
+++ b/examples/aishell3/voc5/path.sh
@@ -0,0 +1 @@
+../../csmsc/voc5/path.sh
\ No newline at end of file
diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh
deleted file mode 100755
index 8b4178f13..000000000
--- a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-
-stage=0
-stop_stage=0
-
-# hifigan
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    FLAGS_allocator_strategy=naive_best_fit \
-    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-    python3 ${BIN_DIR}/synthesize.py \
-        --erniesat_config=${config_path} \
-        --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
-        --erniesat_stat=dump/train/speech_stats.npy \
-        --voc=hifigan_aishell3 \
-        --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \
-        --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
-        --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
-        --test_metadata=dump/test/norm/metadata.jsonl \
-        --output_dir=${train_output_path}/test \
-        --phones_dict=dump/phone_id_map.txt
-fi
diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh
new file mode 120000
index 000000000..5703dcb2c
--- /dev/null
+++ b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh
@@ -0,0 +1 @@
+../../../aishell3/ernie_sat/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh
deleted file mode 100755
index 526aac435..000000000
--- a/examples/aishell3_vctk/ernie_sat/local/train.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-python3 ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=8 \
-    --phones-dict=dump/phone_id_map.txt
\ No newline at end of file
diff --git a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh
new file mode 120000
index 000000000..9f1d2346d
--- /dev/null
+++ b/examples/aishell3_vctk/ernie_sat/local/train.sh
@@ -0,0 +1 @@
+../../../aishell3/ernie_sat/local/train.sh
\ No newline at end of file
diff --git a/examples/aishell3_vctk/ernie_sat/path.sh b/examples/aishell3_vctk/ernie_sat/path.sh
deleted file mode 100755
index 4ecab0251..000000000
--- a/examples/aishell3_vctk/ernie_sat/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=ernie_sat
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
\ No newline at end of file
diff --git a/examples/aishell3_vctk/ernie_sat/path.sh b/examples/aishell3_vctk/ernie_sat/path.sh
new file mode 120000
index 000000000..5ec397590
--- /dev/null
+++ b/examples/aishell3_vctk/ernie_sat/path.sh
@@ -0,0 +1 @@
+../../aishell3/ernie_sat/path.sh
\ No newline at end of file
diff --git a/examples/canton/tts3/README.md b/examples/canton/tts3/README.md
new file mode 100644
index 000000000..d90eef2f3
--- /dev/null
+++ b/examples/canton/tts3/README.md
@@ -0,0 +1,121 @@
+# FastSpeech2 with Cantonese language
+
+## Dataset
+### Download and Extract
+If you don't have the Cantonese datasets, please download and unzip [Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence](https://magichub.com/datasets/guangzhou-cantonese-scripted-speech-corpus-daily-use-sentence/) and [Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle](https://magichub.com/datasets/guangzhou-cantonese-scripted-speech-corpus-in-the-vehicle/) under `~/datasets/`.
+
+To obtain better performance, please combine these two datasets together as follows:
+
+```bash
+mkdir -p ~/datasets/canton_all/WAV
+cp -r ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence/WAV/* ~/datasets/canton_all/WAV
+cp -r ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle/WAV/* ~/datasets/canton_all/WAV
+```
+
+After that, it should look like:
+```
+~/datasets/canton_all
+│   └── WAV
+│       └──G0001
+│       └──G0002
+│       ...
+│       └──G0071
+│       └──G0072
+```
+
+
+### Get MFA Result and Extract
+We use [MFA1.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for canton_fastspeech2.
+You can train your own MFA model by referring to the [canton_mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (which uses MFA1.x now) in our repo.
+We provide the MFA results of these two datasets here: [canton_alignment.zip](https://paddlespeech.bj.bcebos.com/MFA/Canton/canton_alignment.zip)
+
+## Get Started
+Assume the path to the Cantonese MFA result of the two datasets mentioned above is `./canton_alignment`.
+Run the command below to
+1. **source path**.
+2. preprocess the dataset.
+3. train the model.
+4. synthesize wavs.
+    - synthesize waveform from `metadata.jsonl`.
+    - synthesize waveform from text file.
+```bash
+./run.sh
+```
+You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset.
+```bash
+./run.sh --stage 0 --stop-stage 0
+```
+
+### Data Preprocessing
+```bash
+./local/preprocess.sh ${conf_path}
+```
+When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
+```text
+dump
+├── dev
+│   ├── norm
+│   └── raw
+├── phone_id_map.txt
+├── speaker_id_map.txt
+├── test
+│   ├── norm
+│   └── raw
+└── train
+    ├── energy_stats.npy
+    ├── norm
+    ├── pitch_stats.npy
+    ├── raw
+    └── speech_stats.npy
+```
+The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains speech, pitch, and energy features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/*_stats.npy`.
+
+Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, speech_lengths, durations, the path of speech features, the path of pitch features, the path of energy features, speaker, and id of each utterance.
+
+### For training details, refer to the scripts in [examples/aishell3/tts3](../../aishell3/tts3).
+
+## Pretrained Model
+Pretrained FastSpeech2 model with no silence at the edges of audios:
+- [fastspeech2_canton_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_ckpt_1.4.0.zip)
+
+FastSpeech2 checkpoint contains files listed below.
+
+```text
+fastspeech2_canton_ckpt_1.4.0
+├── default.yaml            # default config used to train fastspeech2
+├── energy_stats.npy        # statistics used to normalize energy when training fastspeech2
+├── phone_id_map.txt        # phone vocabulary file when training fastspeech2
+├── pitch_stats.npy         # statistics used to normalize pitch when training fastspeech2
+├── snapshot_iter_140000.pdz # model parameters and optimizer states
+├── speaker_id_map.txt      # speaker id map file when training a multi-speaker fastspeech2
+└── speech_stats.npy        # statistics used to normalize spectrogram when training fastspeech2
+```
+We use [parallel wavegan](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1) as the neural vocoder.
+Download the pretrained parallel wavegan model from [pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip) and unzip it.
+```bash
+unzip pwg_aishell3_ckpt_0.5.zip
+```
+
+You can use the following scripts to synthesize for `${BIN_DIR}/../sentences_canton.txt` using pretrained fastspeech2 and parallel wavegan models.
+```bash
+source path.sh
+
+FLAGS_allocator_strategy=naive_best_fit \
+FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+python3 ${BIN_DIR}/../synthesize_e2e.py \
+  --am=fastspeech2_aishell3 \
+  --am_config=fastspeech2_canton_ckpt_1.4.0/default.yaml \
+  --am_ckpt=fastspeech2_canton_ckpt_1.4.0/snapshot_iter_140000.pdz \
+  --am_stat=fastspeech2_canton_ckpt_1.4.0/speech_stats.npy \
+  --voc=pwgan_aishell3 \
+  --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
+  --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+  --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
+  --lang=canton \
+  --text=${BIN_DIR}/../sentences_canton.txt \
+  --output_dir=exp/default/test_e2e \
+  --phones_dict=fastspeech2_canton_ckpt_1.4.0/phone_id_map.txt \
+  --speaker_dict=fastspeech2_canton_ckpt_1.4.0/speaker_id_map.txt \
+  --spk_id=0 \
+  --inference_dir=exp/default/inference
+```
diff --git a/examples/canton/tts3/conf/default.yaml b/examples/canton/tts3/conf/default.yaml
new file mode 100644
index 000000000..a101e6eea
--- /dev/null
+++ b/examples/canton/tts3/conf/default.yaml
@@ -0,0 +1,107 @@
+###########################################################
+#                FEATURE EXTRACTION SETTING               #
+###########################################################
+
+fs: 24000          # sr
+n_fft: 2048        # FFT size (samples).
+n_shift: 300       # Hop size (samples). 12.5ms
+win_length: 1200   # Window length (samples). 50ms
+                   # If set to null, it will be the same as fft_size.
+window: "hann"     # Window function.
+
+# Only used for feats_type != raw
+
+fmin: 80           # Minimum frequency of Mel basis.
+fmax: 7600         # Maximum frequency of Mel basis.
+n_mels: 80         # The number of mel basis.
+
+# Only used for the model using pitch features (e.g. FastSpeech2)
+# The canton datasets we use are different from others like Databaker or LJSpeech,
+# so we set f0min to 110 to avoid producing too many zero-pitch values.
+# Reference: https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder/issues/38
+f0min: 110          # Minimum f0 for pitch extraction.
+f0max: 400         # Maximum f0 for pitch extraction.
+
+
+###########################################################
+#                       DATA SETTING                      #
+###########################################################
+batch_size: 32
+num_workers: 2
+
+
+###########################################################
+#                       MODEL SETTING                     #
+###########################################################
+model:
+    adim: 384         # attention dimension
+    aheads: 2         # number of attention heads
+    elayers: 4        # number of encoder layers
+    eunits: 1536      # number of encoder ff units
+    dlayers: 4        # number of decoder layers
+    dunits: 1536      # number of decoder ff units
+    positionwise_layer_type: conv1d   # type of position-wise layer
+    positionwise_conv_kernel_size: 3  # kernel size of position wise conv layer
+    duration_predictor_layers: 2      # number of layers of duration predictor
+    duration_predictor_chans: 256     # number of channels of duration predictor
+    duration_predictor_kernel_size: 3 # filter size of duration predictor
+    postnet_layers: 5                 # number of layers of postnet
+    postnet_filts: 5                  # filter size of conv layers in postnet
+    postnet_chans: 256                # number of channels of conv layers in postnet
+    use_scaled_pos_enc: True          # whether to use scaled positional encoding
+    encoder_normalize_before: True    # whether to perform layer normalization before the input
+    decoder_normalize_before: True    # whether to perform layer normalization before the input
+    reduction_factor: 1               # reduction factor
+    init_type: xavier_uniform         # initialization type
+    init_enc_alpha: 1.0               # initial value of alpha of encoder scaled position encoding
+    init_dec_alpha: 1.0               # initial value of alpha of decoder scaled position encoding
+    transformer_enc_dropout_rate: 0.2            # dropout rate for transformer encoder layer
+    transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
+    transformer_enc_attn_dropout_rate: 0.2       # dropout rate for transformer encoder attention layer
+    transformer_dec_dropout_rate: 0.2            # dropout rate for transformer decoder layer
+    transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
+    transformer_dec_attn_dropout_rate: 0.2       # dropout rate for transformer decoder attention layer
+    pitch_predictor_layers: 5                  # number of conv layers in pitch predictor
+    pitch_predictor_chans: 256                 # number of channels of conv layers in pitch predictor
+    pitch_predictor_kernel_size: 5             # kernel size of conv layers in pitch predictor
+    pitch_predictor_dropout: 0.5               # dropout rate in pitch predictor
+    pitch_embed_kernel_size: 1                 # kernel size of conv embedding layer for pitch
+    pitch_embed_dropout: 0.0                   # dropout rate after conv embedding layer for pitch
+    stop_gradient_from_pitch_predictor: True   # whether to stop the gradient from pitch predictor to encoder
+    energy_predictor_layers: 2                 # number of conv layers in energy predictor
+    energy_predictor_chans: 256                # number of channels of conv layers in energy predictor
+    energy_predictor_kernel_size: 3            # kernel size of conv layers in energy predictor
+    energy_predictor_dropout: 0.5              # dropout rate in energy predictor
+    energy_embed_kernel_size: 1                # kernel size of conv embedding layer for energy
+    energy_embed_dropout: 0.0                  # dropout rate after conv embedding layer for energy
+    stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
+    spk_embed_dim: 256                         # speaker embedding dimension
+    spk_embed_integration_type: concat         # speaker embedding integration type
+
+
+
+###########################################################
+#                       UPDATER SETTING                   #
+###########################################################
+updater:
+    use_masking: True                 # whether to apply masking for padded part in loss calculation
+
+
+###########################################################
+#                     OPTIMIZER SETTING                   #
+###########################################################
+optimizer:
+    optim: adam               # optimizer type
+    learning_rate: 0.001      # learning rate
+
+###########################################################
+#                     TRAINING SETTING                    #
+###########################################################
+max_epoch: 1000
+num_snapshots: 5
+
+
+###########################################################
+#                       OTHER SETTING                     #
+###########################################################
+seed: 10086
diff --git a/examples/canton/tts3/local/preprocess.sh b/examples/canton/tts3/local/preprocess.sh
new file mode 100755
index 000000000..f70b1c028
--- /dev/null
+++ b/examples/canton/tts3/local/preprocess.sh
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+stage=0
+stop_stage=100
+
+config_path=$1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # get durations from MFA's result
+    echo "Generate durations.txt from MFA results ..."
+    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
+        --inputdir=./canton_alignment \
+        --output durations.txt \
+        --config=${config_path}
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # extract features
+    echo "Extract features ..."
+    python3 ${BIN_DIR}/preprocess.py \
+        --dataset=canton \
+        --rootdir=~/datasets/canton_all \
+        --dumpdir=dump \
+        --dur-file=durations.txt \
+        --config=${config_path} \
+        --num-cpu=20 \
+        --cut-sil=True
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # get features' stats(mean and std)
+    echo "Get features' stats ..."
+    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --field-name="speech"
+
+    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --field-name="pitch"
+
+    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --field-name="energy"
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # normalize and convert phone/speaker to id, dev and test should use train's stats
+    echo "Normalize ..."
+    python3 ${BIN_DIR}/normalize.py \
+        --metadata=dump/train/raw/metadata.jsonl \
+        --dumpdir=dump/train/norm \
+        --speech-stats=dump/train/speech_stats.npy \
+        --pitch-stats=dump/train/pitch_stats.npy \
+        --energy-stats=dump/train/energy_stats.npy \
+        --phones-dict=dump/phone_id_map.txt \
+        --speaker-dict=dump/speaker_id_map.txt
+
+    python3 ${BIN_DIR}/normalize.py \
+        --metadata=dump/dev/raw/metadata.jsonl \
+        --dumpdir=dump/dev/norm \
+        --speech-stats=dump/train/speech_stats.npy \
+        --pitch-stats=dump/train/pitch_stats.npy \
+        --energy-stats=dump/train/energy_stats.npy \
+        --phones-dict=dump/phone_id_map.txt \
+        --speaker-dict=dump/speaker_id_map.txt
+
+    python3 ${BIN_DIR}/normalize.py \
+        --metadata=dump/test/raw/metadata.jsonl \
+        --dumpdir=dump/test/norm \
+        --speech-stats=dump/train/speech_stats.npy \
+        --pitch-stats=dump/train/pitch_stats.npy \
+        --energy-stats=dump/train/energy_stats.npy \
+        --phones-dict=dump/phone_id_map.txt \
+        --speaker-dict=dump/speaker_id_map.txt
+fi
diff --git a/examples/canton/tts3/local/synthesize.sh b/examples/canton/tts3/local/synthesize.sh
new file mode 120000
index 000000000..ca9966ed5
--- /dev/null
+++ b/examples/canton/tts3/local/synthesize.sh
@@ -0,0 +1 @@
+../../../aishell3/tts3/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/canton/tts3/local/synthesize_e2e.sh b/examples/canton/tts3/local/synthesize_e2e.sh
new file mode 100755
index 000000000..509129e3d
--- /dev/null
+++ b/examples/canton/tts3/local/synthesize_e2e.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=fastspeech2_canton \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=pwgan_aishell3 \
+        --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \
+        --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \
+        --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \
+        --lang=canton \
+        --text=${BIN_DIR}/../sentences_canton.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --spk_id=0 \
+        --inference_dir=${train_output_path}/inference
+fi
+
+# hifigan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "in hifigan syn_e2e"
+    FLAGS_allocator_strategy=naive_best_fit \
+    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=fastspeech2_canton \
+        --am_config=${config_path} \
+        --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+        --am_stat=dump/train/speech_stats.npy \
+        --voc=hifigan_aishell3 \
+        --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \
+        --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \
+        --lang=canton \
+        --text=${BIN_DIR}/../sentences_canton.txt \
+        --output_dir=${train_output_path}/test_e2e \
+        --phones_dict=dump/phone_id_map.txt \
+        --speaker_dict=dump/speaker_id_map.txt \
+        --spk_id=0 \
+        --inference_dir=${train_output_path}/inference
+    fi
diff --git a/examples/canton/tts3/local/train.sh b/examples/canton/tts3/local/train.sh
new file mode 120000
index 000000000..78885a300
--- /dev/null
+++ b/examples/canton/tts3/local/train.sh
@@ -0,0 +1 @@
+../../../aishell3/tts3/local/train.sh
\ No newline at end of file
diff --git a/examples/canton/tts3/path.sh b/examples/canton/tts3/path.sh
new file mode 120000
index 000000000..4785b9095
--- /dev/null
+++ b/examples/canton/tts3/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts3/path.sh
\ No newline at end of file
diff --git a/examples/canton/tts3/run.sh b/examples/canton/tts3/run.sh
new file mode 100755
index 000000000..e84323134
--- /dev/null
+++ b/examples/canton/tts3/run.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/default.yaml
+train_output_path=exp/default
+
+ckpt_name=snapshot_iter_140000.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with positional arguments such as `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # prepare data
+    ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # synthesize, vocoder is pwgan by default
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # synthesize_e2e, vocoder is pwgan by default
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
diff --git a/examples/csmsc/tts3/local/PTQ_static.sh b/examples/csmsc/tts3/local/PTQ_static.sh
index a70a77b58..c6dce53cb 100755
--- a/examples/csmsc/tts3/local/PTQ_static.sh
+++ b/examples/csmsc/tts3/local/PTQ_static.sh
@@ -5,4 +5,4 @@ python3 ${BIN_DIR}/../PTQ_static.py \
     --dev-metadata=dump/dev/norm/metadata.jsonl \
     --inference_dir ${train_output_path}/inference \
     --model_name ${model_name} \
-    --onnx_forma=True
\ No newline at end of file
+    --onnx_format=True
\ No newline at end of file
diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md
index 8f223e07b..50d703b2d 100644
--- a/examples/csmsc/vits/README.md
+++ b/examples/csmsc/vits/README.md
@@ -147,14 +147,14 @@ optional arguments:
 
 The pretrained model can be downloaded here:
 
-- [vits_csmsc_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/vits/vits_csmsc_ckpt_1.1.0.zip) (add_blank=true)
+- [vits_csmsc_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/vits/vits_csmsc_ckpt_1.4.0.zip) (add_blank=true)
 
 VITS checkpoint contains files listed below.
 ```text
-vits_csmsc_ckpt_1.1.0
-├── default.yaml              # default config used to train vitx
-├── phone_id_map.txt          # phone vocabulary file when training vits
-└── snapshot_iter_333000.pdz  # model parameters and optimizer states
+vits_csmsc_ckpt_1.4.0
+├── default.yaml                    # default config used to train vits
+├── phone_id_map.txt                # phone vocabulary file when training vits
+└── snapshot_iter_150000.pdz  # model parameters and optimizer states
 ```
 
 ps: This ckpt is not good enough, a better result is training
@@ -168,9 +168,9 @@ add_blank=true
 FLAGS_allocator_strategy=naive_best_fit \
 FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/synthesize_e2e.py \
-    --config=vits_csmsc_ckpt_1.1.0/default.yaml \
-    --ckpt=vits_csmsc_ckpt_1.1.0/snapshot_iter_333000.pdz \
-    --phones_dict=vits_csmsc_ckpt_1.1.0/phone_id_map.txt \
+    --config=vits_csmsc_ckpt_1.4.0/default.yaml \
+    --ckpt=vits_csmsc_ckpt_1.4.0/snapshot_iter_150000.pdz \
+    --phones_dict=vits_csmsc_ckpt_1.4.0/phone_id_map.txt \
     --output_dir=exp/default/test_e2e \
     --text=${BIN_DIR}/../sentences.txt \
     --add-blank=${add_blank} 
diff --git a/examples/csmsc/voc1/local/PTQ_static.sh b/examples/csmsc/voc1/local/PTQ_static.sh
index 2e5166141..c85ebd109 100755
--- a/examples/csmsc/voc1/local/PTQ_static.sh
+++ b/examples/csmsc/voc1/local/PTQ_static.sh
@@ -2,7 +2,7 @@ train_output_path=$1
 model_name=$2
 
 python3 ${BIN_DIR}/../../PTQ_static.py \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
+    --dev-metadata=dump/dev/raw/metadata.jsonl \
     --inference_dir ${train_output_path}/inference \
     --model_name ${model_name} \
     --onnx_format=True 
\ No newline at end of file
diff --git a/examples/csmsc/voc1/local/preprocess.sh b/examples/csmsc/voc1/local/preprocess.sh
index 61d6d62be..62d0717b9 100755
--- a/examples/csmsc/voc1/local/preprocess.sh
+++ b/examples/csmsc/voc1/local/preprocess.sh
@@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump/train/raw/metadata.jsonl \
         --dumpdir=dump/train/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
+        --skip-wav-copy
+        
     python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump/dev/raw/metadata.jsonl \
         --dumpdir=dump/dev/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
+        --skip-wav-copy
     
     python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump/test/raw/metadata.jsonl \
         --dumpdir=dump/test/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
+        --skip-wav-copy
 fi
diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh
deleted file mode 100755
index 6719bd0be..000000000
--- a/examples/csmsc/voc3/finetune.sh
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/bin/bash
-
-source path.sh
-
-gpus=0
-stage=0
-stop_stage=100
-
-source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \
-        --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \
-        --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \
-        --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \
-        --dur-file=durations.txt \
-        --output-dir=dump_finetune \
-        --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \
-        --dataset=baker \
-        --rootdir=~/datasets/BZNSYP/
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    python3 ${MAIN_ROOT}/utils/link_wav.py \
-        --old-dump-dir=dump \
-        --dump-dir=dump_finetune
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # get features' stats(mean and std)
-    echo "Get features' stats ..."
-    cp dump/train/feats_stats.npy dump_finetune/train/
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-    # normalize, dev and test should use train's stats
-    echo "Normalize ..."
-   
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump_finetune/train/raw/metadata.jsonl \
-        --dumpdir=dump_finetune/train/norm \
-        --stats=dump_finetune/train/feats_stats.npy
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump_finetune/dev/raw/metadata.jsonl \
-        --dumpdir=dump_finetune/dev/norm \
-        --stats=dump_finetune/train/feats_stats.npy
-    
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump_finetune/test/raw/metadata.jsonl \
-        --dumpdir=dump_finetune/test/norm \
-        --stats=dump_finetune/train/feats_stats.npy
-fi
-
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-    CUDA_VISIBLE_DEVICES=${gpus} \
-    FLAGS_cudnn_exhaustive_search=true \
-    FLAGS_conv_workspace_size_limit=4000 \
-    python ${BIN_DIR}/train.py \
-        --train-metadata=dump_finetune/train/norm/metadata.jsonl \
-        --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \
-        --config=conf/finetune.yaml \
-        --output-dir=exp/finetune \
-        --ngpu=1
-fi 
\ No newline at end of file
diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh
new file mode 120000
index 000000000..b6fa868e2
--- /dev/null
+++ b/examples/csmsc/voc3/finetune.sh
@@ -0,0 +1 @@
+../voc5/finetune.sh
\ No newline at end of file
diff --git a/examples/csmsc/voc3/local/preprocess.sh b/examples/csmsc/voc3/local/preprocess.sh
deleted file mode 100755
index 61d6d62be..000000000
--- a/examples/csmsc/voc3/local/preprocess.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-
-stage=0
-stop_stage=100
-
-config_path=$1
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # get durations from MFA's result
-    echo "Generate durations.txt from MFA results ..."
-    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
-        --inputdir=./baker_alignment_tone \
-        --output=durations.txt \
-        --config=${config_path}
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # extract features
-    echo "Extract features ..."
-    python3 ${BIN_DIR}/../preprocess.py \
-        --rootdir=~/datasets/BZNSYP/ \
-        --dataset=baker \
-        --dumpdir=dump \
-        --dur-file=durations.txt \
-        --config=${config_path} \
-        --cut-sil=True \
-        --num-cpu=20
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # get features' stats(mean and std)
-    echo "Get features' stats ..."
-    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
-        --metadata=dump/train/raw/metadata.jsonl \
-        --field-name="feats"
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-    # normalize, dev and test should use train's stats
-    echo "Normalize ..."
-   
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/train/raw/metadata.jsonl \
-        --dumpdir=dump/train/norm \
-        --stats=dump/train/feats_stats.npy
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/dev/raw/metadata.jsonl \
-        --dumpdir=dump/dev/norm \
-        --stats=dump/train/feats_stats.npy
-    
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/test/raw/metadata.jsonl \
-        --dumpdir=dump/test/norm \
-        --stats=dump/train/feats_stats.npy
-fi
diff --git a/examples/csmsc/voc3/local/preprocess.sh b/examples/csmsc/voc3/local/preprocess.sh
new file mode 120000
index 000000000..f0cb24de9
--- /dev/null
+++ b/examples/csmsc/voc3/local/preprocess.sh
@@ -0,0 +1 @@
+../../voc1/local/preprocess.sh
\ No newline at end of file
diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/csmsc/voc3/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=1
diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh
new file mode 120000
index 000000000..9ec3ed94b
--- /dev/null
+++ b/examples/csmsc/voc3/local/train.sh
@@ -0,0 +1 @@
+../../voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh
deleted file mode 100755
index 61d6d62be..000000000
--- a/examples/csmsc/voc4/local/preprocess.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-
-stage=0
-stop_stage=100
-
-config_path=$1
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # get durations from MFA's result
-    echo "Generate durations.txt from MFA results ..."
-    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
-        --inputdir=./baker_alignment_tone \
-        --output=durations.txt \
-        --config=${config_path}
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # extract features
-    echo "Extract features ..."
-    python3 ${BIN_DIR}/../preprocess.py \
-        --rootdir=~/datasets/BZNSYP/ \
-        --dataset=baker \
-        --dumpdir=dump \
-        --dur-file=durations.txt \
-        --config=${config_path} \
-        --cut-sil=True \
-        --num-cpu=20
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # get features' stats(mean and std)
-    echo "Get features' stats ..."
-    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
-        --metadata=dump/train/raw/metadata.jsonl \
-        --field-name="feats"
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-    # normalize, dev and test should use train's stats
-    echo "Normalize ..."
-   
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/train/raw/metadata.jsonl \
-        --dumpdir=dump/train/norm \
-        --stats=dump/train/feats_stats.npy
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/dev/raw/metadata.jsonl \
-        --dumpdir=dump/dev/norm \
-        --stats=dump/train/feats_stats.npy
-    
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/test/raw/metadata.jsonl \
-        --dumpdir=dump/test/norm \
-        --stats=dump/train/feats_stats.npy
-fi
diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh
new file mode 120000
index 000000000..f0cb24de9
--- /dev/null
+++ b/examples/csmsc/voc4/local/preprocess.sh
@@ -0,0 +1 @@
+../../voc1/local/preprocess.sh
\ No newline at end of file
diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/csmsc/voc4/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=1
diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh
new file mode 120000
index 000000000..9ec3ed94b
--- /dev/null
+++ b/examples/csmsc/voc4/local/train.sh
@@ -0,0 +1 @@
+../../voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/csmsc/voc5/finetune.sh b/examples/csmsc/voc5/finetune.sh
index 6719bd0be..eb8325aeb 100755
--- a/examples/csmsc/voc5/finetune.sh
+++ b/examples/csmsc/voc5/finetune.sh
@@ -39,16 +39,19 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump_finetune/train/raw/metadata.jsonl \
         --dumpdir=dump_finetune/train/norm \
-        --stats=dump_finetune/train/feats_stats.npy
+        --stats=dump_finetune/train/feats_stats.npy \
+        --skip-wav-copy
     python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump_finetune/dev/raw/metadata.jsonl \
         --dumpdir=dump_finetune/dev/norm \
-        --stats=dump_finetune/train/feats_stats.npy
+        --stats=dump_finetune/train/feats_stats.npy \
+        --skip-wav-copy
     
     python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump_finetune/test/raw/metadata.jsonl \
         --dumpdir=dump_finetune/test/norm \
-        --stats=dump_finetune/train/feats_stats.npy
+        --stats=dump_finetune/train/feats_stats.npy \
+        --skip-wav-copy
 fi
 
 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
diff --git a/examples/csmsc/voc5/local/preprocess.sh b/examples/csmsc/voc5/local/preprocess.sh
deleted file mode 100755
index 61d6d62be..000000000
--- a/examples/csmsc/voc5/local/preprocess.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-
-stage=0
-stop_stage=100
-
-config_path=$1
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # get durations from MFA's result
-    echo "Generate durations.txt from MFA results ..."
-    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
-        --inputdir=./baker_alignment_tone \
-        --output=durations.txt \
-        --config=${config_path}
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # extract features
-    echo "Extract features ..."
-    python3 ${BIN_DIR}/../preprocess.py \
-        --rootdir=~/datasets/BZNSYP/ \
-        --dataset=baker \
-        --dumpdir=dump \
-        --dur-file=durations.txt \
-        --config=${config_path} \
-        --cut-sil=True \
-        --num-cpu=20
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # get features' stats(mean and std)
-    echo "Get features' stats ..."
-    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
-        --metadata=dump/train/raw/metadata.jsonl \
-        --field-name="feats"
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-    # normalize, dev and test should use train's stats
-    echo "Normalize ..."
-   
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/train/raw/metadata.jsonl \
-        --dumpdir=dump/train/norm \
-        --stats=dump/train/feats_stats.npy
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/dev/raw/metadata.jsonl \
-        --dumpdir=dump/dev/norm \
-        --stats=dump/train/feats_stats.npy
-    
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/test/raw/metadata.jsonl \
-        --dumpdir=dump/test/norm \
-        --stats=dump/train/feats_stats.npy
-fi
diff --git a/examples/csmsc/voc5/local/preprocess.sh b/examples/csmsc/voc5/local/preprocess.sh
new file mode 120000
index 000000000..f0cb24de9
--- /dev/null
+++ b/examples/csmsc/voc5/local/preprocess.sh
@@ -0,0 +1 @@
+../../voc1/local/preprocess.sh
\ No newline at end of file
diff --git a/examples/csmsc/voc5/local/train.sh b/examples/csmsc/voc5/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/csmsc/voc5/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=1
diff --git a/examples/csmsc/voc5/local/train.sh b/examples/csmsc/voc5/local/train.sh
new file mode 120000
index 000000000..9ec3ed94b
--- /dev/null
+++ b/examples/csmsc/voc5/local/train.sh
@@ -0,0 +1 @@
+../../voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/csmsc/voc6/local/preprocess.sh b/examples/csmsc/voc6/local/preprocess.sh
index 2dcc39ac7..509824b8e 100755
--- a/examples/csmsc/voc6/local/preprocess.sh
+++ b/examples/csmsc/voc6/local/preprocess.sh
@@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
         --metadata=dump/train/raw/metadata.jsonl \
         --dumpdir=dump/train/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
+        --skip-wav-copy
+
     python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
         --metadata=dump/dev/raw/metadata.jsonl \
         --dumpdir=dump/dev/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
+        --skip-wav-copy
     
     python3 ${BIN_DIR}/../gan_vocoder/normalize.py \
         --metadata=dump/test/raw/metadata.jsonl \
         --dumpdir=dump/test/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
+        --skip-wav-copy
 fi
diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/csmsc/voc6/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=1
diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh
new file mode 120000
index 000000000..9ec3ed94b
--- /dev/null
+++ b/examples/csmsc/voc6/local/train.sh
@@ -0,0 +1 @@
+../../voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh
deleted file mode 100755
index f90db9150..000000000
--- a/examples/ljspeech/tts0/local/train.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-python3 ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=1 \
-    --phones-dict=dump/phone_id_map.txt
\ No newline at end of file
diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh
new file mode 120000
index 000000000..7f54e9239
--- /dev/null
+++ b/examples/ljspeech/tts0/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/tts0/local/train.sh
\ No newline at end of file
diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh
deleted file mode 100755
index a37cd21e3..000000000
--- a/examples/ljspeech/tts0/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=tacotron2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh
new file mode 120000
index 000000000..9e1fdbd16
--- /dev/null
+++ b/examples/ljspeech/tts0/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts0/path.sh
\ No newline at end of file
diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh
deleted file mode 100755
index d1302f99f..000000000
--- a/examples/ljspeech/tts3/local/train.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-python3 ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=1 \
-    --phones-dict=dump/phone_id_map.txt
diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh
new file mode 120000
index 000000000..d7b05058e
--- /dev/null
+++ b/examples/ljspeech/tts3/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/tts3/local/train.sh
\ No newline at end of file
diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh
deleted file mode 100755
index fb7e8411c..000000000
--- a/examples/ljspeech/tts3/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=fastspeech2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh
new file mode 120000
index 000000000..4785b9095
--- /dev/null
+++ b/examples/ljspeech/tts3/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts3/path.sh
\ No newline at end of file
diff --git a/examples/ljspeech/voc1/local/preprocess.sh b/examples/ljspeech/voc1/local/preprocess.sh
index d1af60dad..bfbf75b7d 100755
--- a/examples/ljspeech/voc1/local/preprocess.sh
+++ b/examples/ljspeech/voc1/local/preprocess.sh
@@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump/train/raw/metadata.jsonl \
         --dumpdir=dump/train/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
+        --skip-wav-copy
+
     python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump/dev/raw/metadata.jsonl \
         --dumpdir=dump/dev/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
+        --skip-wav-copy
     
     python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump/test/raw/metadata.jsonl \
         --dumpdir=dump/test/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
+        --skip-wav-copy
 fi
diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh
deleted file mode 100755
index 145557b3d..000000000
--- a/examples/ljspeech/voc1/local/synthesize.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
-    --config=${config_path} \
-    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
-    --test-metadata=dump/test/norm/metadata.jsonl \
-    --output-dir=${train_output_path}/test \
-    --generator-type=pwgan
diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh
new file mode 120000
index 000000000..d6aecd8d1
--- /dev/null
+++ b/examples/ljspeech/voc1/local/synthesize.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/ljspeech/voc1/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=1
diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh
new file mode 120000
index 000000000..2942893d2
--- /dev/null
+++ b/examples/ljspeech/voc1/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/ljspeech/voc1/path.sh b/examples/ljspeech/voc1/path.sh
deleted file mode 100755
index 1e6647b86..000000000
--- a/examples/ljspeech/voc1/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=parallelwave_gan
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
\ No newline at end of file
diff --git a/examples/ljspeech/voc1/path.sh b/examples/ljspeech/voc1/path.sh
new file mode 120000
index 000000000..b7ed4fb8f
--- /dev/null
+++ b/examples/ljspeech/voc1/path.sh
@@ -0,0 +1 @@
+../../csmsc/voc1/path.sh
\ No newline at end of file
diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh
deleted file mode 100755
index d1af60dad..000000000
--- a/examples/ljspeech/voc5/local/preprocess.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-
-stage=0
-stop_stage=100
-
-config_path=$1
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # get durations from MFA's result
-    echo "Generate durations.txt from MFA results ..."
-    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
-        --inputdir=./ljspeech_alignment \
-        --output=durations.txt \
-        --config=${config_path}
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # extract features
-    echo "Extract features ..."
-    python3 ${BIN_DIR}/../preprocess.py \
-        --rootdir=~/datasets/LJSpeech-1.1/ \
-        --dataset=ljspeech \
-        --dumpdir=dump \
-        --dur-file=durations.txt \
-        --config=${config_path} \
-        --cut-sil=True \
-        --num-cpu=20
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # get features' stats(mean and std)
-    echo "Get features' stats ..."
-    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
-        --metadata=dump/train/raw/metadata.jsonl \
-        --field-name="feats"
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-    # normalize, dev and test should use train's stats
-    echo "Normalize ..."
-   
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/train/raw/metadata.jsonl \
-        --dumpdir=dump/train/norm \
-        --stats=dump/train/feats_stats.npy
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/dev/raw/metadata.jsonl \
-        --dumpdir=dump/dev/norm \
-        --stats=dump/train/feats_stats.npy
-    
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/test/raw/metadata.jsonl \
-        --dumpdir=dump/test/norm \
-        --stats=dump/train/feats_stats.npy
-fi
diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh
new file mode 120000
index 000000000..f0cb24de9
--- /dev/null
+++ b/examples/ljspeech/voc5/local/preprocess.sh
@@ -0,0 +1 @@
+../../voc1/local/preprocess.sh
\ No newline at end of file
diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh
deleted file mode 100755
index 647896175..000000000
--- a/examples/ljspeech/voc5/local/synthesize.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
-    --config=${config_path} \
-    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
-    --test-metadata=dump/test/norm/metadata.jsonl \
-    --output-dir=${train_output_path}/test \
-    --generator-type=hifigan
diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh
new file mode 120000
index 000000000..c887112c0
--- /dev/null
+++ b/examples/ljspeech/voc5/local/synthesize.sh
@@ -0,0 +1 @@
+../../../csmsc/voc5/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/ljspeech/voc5/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=1
diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh
new file mode 120000
index 000000000..2942893d2
--- /dev/null
+++ b/examples/ljspeech/voc5/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh
deleted file mode 100755
index 7451b3218..000000000
--- a/examples/ljspeech/voc5/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=hifigan
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh
new file mode 120000
index 000000000..b67fe2b39
--- /dev/null
+++ b/examples/ljspeech/voc5/path.sh
@@ -0,0 +1 @@
+../../csmsc/voc5/path.sh
\ No newline at end of file
diff --git a/examples/other/mfa/README.md b/examples/other/mfa/README.md
index 216d1275b..b85dac4d9 100644
--- a/examples/other/mfa/README.md
+++ b/examples/other/mfa/README.md
@@ -7,3 +7,10 @@ Run the following script to get started, for more detail, please see `run.sh`.
 # Rhythm tags for MFA
 If you want to get rhythm tags with duration through MFA tool, you may add flag `--rhy-with-duration` in the first two commands in `run.sh`
 Note that only CSMSC dataset is supported so far, and we replace `#` with `sp` in rhythm tags for MFA.
+
+# MFA for Cantonese language
+First, download the [Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence](https://magichub.com/datasets/guangzhou-cantonese-scripted-speech-corpus-daily-use-sentence/) and [Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle](https://magichub.com/datasets/guangzhou-cantonese-scripted-speech-corpus-in-the-vehicle/) datasets and place them under `~/datasets/`.
+Then run:
+```bash
+./run_canton.sh
+```
diff --git a/examples/other/mfa/local/generate_canton_lexicon_wavlabs.py b/examples/other/mfa/local/generate_canton_lexicon_wavlabs.py
new file mode 100644
index 000000000..36bb74467
--- /dev/null
+++ b/examples/other/mfa/local/generate_canton_lexicon_wavlabs.py
@@ -0,0 +1,80 @@
+import argparse
+import os
+import re
+import shutil
+
+import ToJyutping
+
+
+def check(str):
+    my_re = re.compile(r'[A-Za-z]', re.S)
+    res = re.findall(my_re, str)
+    if len(res):
+        return True
+    else:
+        return False
+
+
+consonants = [
+    'p', 'b', 't', 'd', 'ts', 'dz', 'k', 'g', 'kw', 'gw', 'f', 'h', 'l', 'm',
+    'ng', 'n', 's', 'y', 'w', 'c', 'z', 'j'
+]
+
+
+def get_lines(canton):  # builds one lexicon line: "<syllable> <initial> <rest>", or "<syllable> <syllable>" if no initial matches
+    for consonant in consonants:  # NOTE(review): first match in list order wins, so 'ts'/'dz'/'kw'/'gw' are never chosen ('t'/'d'/'k'/'g' come first) - confirm intended
+        if canton.startswith(consonant):
+            c, v = canton[:len(consonant)], canton[len(consonant):]
+            return canton + ' ' + c + ' ' + v
+    return canton + ' ' + canton
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Generate lexicon for Cantonese pinyin to phoneme for MFA")
+    parser.add_argument(
+        "--output_lexicon", type=str, help="Path to save lexicon.")
+    parser.add_argument(
+        "--output_wavlabs",
+        type=str,
+        help="Path of wavs and labs for MFA training.")
+    parser.add_argument(
+        "--inputs", type=str, nargs="+", help="Path to the cantonese datasets.")
+    args = parser.parse_args()
+
+    os.makedirs(args.output_wavlabs, exist_ok=True)  # tolerate re-runs after a partial failure left the directory behind
+
+    utterance_info = []
+    all_canton = []
+    for input_ in args.inputs:
+        utt = "UTTRANSINFO.txt" if "Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence" in input_ else "UTTERANCEINFO.txt"
+        input_utttxt = os.path.join(input_, utt)
+
+        with open(input_utttxt, 'r') as f:
+            utterance_info = f.readlines()[1:]
+
+        for utterance_line in utterance_info:
+            _, wav_name, spk, _, text = utterance_line.split('\t')
+            text = text.strip().replace(' ', '')
+            # check the characters and drop the short text.
+            if not check(text) and len(text) > 2:
+                source_path = os.path.join(input_, 'WAV', spk, wav_name)
+                out_spk_path = os.path.join(args.output_wavlabs, spk)
+                os.makedirs(out_spk_path, exist_ok=True)
+                target_path = os.path.join(out_spk_path, wav_name)
+
+                shutil.copy(source_path, target_path)
+
+                lab_name = wav_name.split('.')[0] + '.lab'
+                lab_target_path = os.path.join(out_spk_path, lab_name)
+                canton_list = ToJyutping.get_jyutping_text(text)
+                with open(lab_target_path, 'w') as f:
+                    f.write(canton_list)
+
+                canton_list = canton_list.split(' ')
+                all_canton.extend(canton_list)
+    all_canton = set(all_canton)
+
+    with open(args.output_lexicon, 'w') as f:
+        for canton in all_canton:
+            f.write(get_lines(canton) + '\n')
diff --git a/examples/other/mfa/run_canton.sh b/examples/other/mfa/run_canton.sh
new file mode 100755
index 000000000..cef6a2f04
--- /dev/null
+++ b/examples/other/mfa/run_canton.sh
@@ -0,0 +1,34 @@
+EXP_DIR=exp
+
+mkdir -p $EXP_DIR
+LEXICON_NAME='canton'
+if [ ! -f "$EXP_DIR/$LEXICON_NAME.lexicon" ]; then
+    echo "generating lexicon and training data..."
+    python local/generate_canton_lexicon_wavlabs.py --output_lexicon "$EXP_DIR/$LEXICON_NAME.lexicon" --output_wavlabs "$EXP_DIR/$LEXICON_NAME"_wavlabs --inputs ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_Daily_Use_Sentence ~/datasets/Guangzhou_Cantonese_Scripted_Speech_Corpus_in_Vehicle
+    echo "lexicon and training data done"
+fi
+
+
+MFA_DOWNLOAD_DIR=local/
+
+if [ ! -f "$MFA_DOWNLOAD_DIR/montreal-forced-aligner_linux.tar.gz" ]; then
+    echo "downloading mfa..."
+    (cd $MFA_DOWNLOAD_DIR && wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz)
+    echo "download mfa done!"
+fi
+
+if [ ! -d "$MFA_DOWNLOAD_DIR/montreal-forced-aligner" ]; then
+    echo "extracting mfa..."
+    (cd $MFA_DOWNLOAD_DIR && tar xvf "montreal-forced-aligner_linux.tar.gz")
+    echo "extraction done!"
+fi
+
+export PATH="$MFA_DOWNLOAD_DIR/montreal-forced-aligner/bin:$PATH"  # prepend instead of replacing PATH so system tools stay reachable
+if [ ! -d "$EXP_DIR/canton_alignment" ]; then
+    echo "Start MFA training..."
+    mfa_train_and_align "$EXP_DIR/$LEXICON_NAME"_wavlabs "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/canton_alignment -o $EXP_DIR/canton_model --clean --verbose --temp_directory $EXP_DIR/.mfa_train_and_align
+    echo "training done!"
+    echo "results: $EXP_DIR/canton_alignment"
+    echo "model: $EXP_DIR/canton_model"
+fi
+
diff --git a/examples/tess/cls0/local/train.py b/examples/tess/cls0/local/train.py
index 25382d8c3..f023a37b7 100644
--- a/examples/tess/cls0/local/train.py
+++ b/examples/tess/cls0/local/train.py
@@ -121,7 +121,7 @@ if __name__ == "__main__":
             optimizer.clear_grad()
 
             # Calculate loss
-            avg_loss += loss.numpy()[0]
+            avg_loss += float(loss)
 
             # Calculate metrics
             preds = paddle.argmax(logits, axis=1)
diff --git a/examples/vctk/ernie_sat/local/train.sh b/examples/vctk/ernie_sat/local/train.sh
deleted file mode 100755
index 526aac435..000000000
--- a/examples/vctk/ernie_sat/local/train.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-python3 ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=8 \
-    --phones-dict=dump/phone_id_map.txt
\ No newline at end of file
diff --git a/examples/vctk/ernie_sat/local/train.sh b/examples/vctk/ernie_sat/local/train.sh
new file mode 120000
index 000000000..9f1d2346d
--- /dev/null
+++ b/examples/vctk/ernie_sat/local/train.sh
@@ -0,0 +1 @@
+../../../aishell3/ernie_sat/local/train.sh
\ No newline at end of file
diff --git a/examples/vctk/ernie_sat/path.sh b/examples/vctk/ernie_sat/path.sh
deleted file mode 100755
index 4ecab0251..000000000
--- a/examples/vctk/ernie_sat/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=ernie_sat
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
\ No newline at end of file
diff --git a/examples/vctk/ernie_sat/path.sh b/examples/vctk/ernie_sat/path.sh
new file mode 120000
index 000000000..5ec397590
--- /dev/null
+++ b/examples/vctk/ernie_sat/path.sh
@@ -0,0 +1 @@
+../../aishell3/ernie_sat/path.sh
\ No newline at end of file
diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh
deleted file mode 100755
index 3a5076505..000000000
--- a/examples/vctk/tts3/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-python3 ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=1 \
-    --phones-dict=dump/phone_id_map.txt \
-    --speaker-dict=dump/speaker_id_map.txt
diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh
new file mode 120000
index 000000000..78885a300
--- /dev/null
+++ b/examples/vctk/tts3/local/train.sh
@@ -0,0 +1 @@
+../../../aishell3/tts3/local/train.sh
\ No newline at end of file
diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh
deleted file mode 100755
index fb7e8411c..000000000
--- a/examples/vctk/tts3/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=fastspeech2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh
new file mode 120000
index 000000000..4785b9095
--- /dev/null
+++ b/examples/vctk/tts3/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts3/path.sh
\ No newline at end of file
diff --git a/examples/vctk/voc1/local/preprocess.sh b/examples/vctk/voc1/local/preprocess.sh
index 88a478cd5..6b7e5288a 100755
--- a/examples/vctk/voc1/local/preprocess.sh
+++ b/examples/vctk/voc1/local/preprocess.sh
@@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump/train/raw/metadata.jsonl \
         --dumpdir=dump/train/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
+        --skip-wav-copy
+
     python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump/dev/raw/metadata.jsonl \
         --dumpdir=dump/dev/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
+        --skip-wav-copy
     
     python3 ${BIN_DIR}/../normalize.py \
         --metadata=dump/test/raw/metadata.jsonl \
         --dumpdir=dump/test/norm \
-        --stats=dump/train/feats_stats.npy
+        --stats=dump/train/feats_stats.npy \
+        --skip-wav-copy
 fi
diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh
deleted file mode 100755
index 145557b3d..000000000
--- a/examples/vctk/voc1/local/synthesize.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
-    --config=${config_path} \
-    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
-    --test-metadata=dump/test/norm/metadata.jsonl \
-    --output-dir=${train_output_path}/test \
-    --generator-type=pwgan
diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh
new file mode 120000
index 000000000..d6aecd8d1
--- /dev/null
+++ b/examples/vctk/voc1/local/synthesize.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/vctk/voc1/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=1
diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh
new file mode 120000
index 000000000..2942893d2
--- /dev/null
+++ b/examples/vctk/voc1/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh
deleted file mode 100755
index 1e6647b86..000000000
--- a/examples/vctk/voc1/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=parallelwave_gan
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
\ No newline at end of file
diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh
new file mode 120000
index 000000000..b7ed4fb8f
--- /dev/null
+++ b/examples/vctk/voc1/path.sh
@@ -0,0 +1 @@
+../../csmsc/voc1/path.sh
\ No newline at end of file
diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh
deleted file mode 100755
index 88a478cd5..000000000
--- a/examples/vctk/voc5/local/preprocess.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-
-stage=0
-stop_stage=100
-
-config_path=$1
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # get durations from MFA's result
-    echo "Generate durations.txt from MFA results ..."
-    python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
-        --inputdir=./vctk_alignment \
-        --output=durations.txt \
-        --config=${config_path}
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # extract features
-    echo "Extract features ..."
-    python3 ${BIN_DIR}/../preprocess.py \
-        --rootdir=~/datasets/VCTK-Corpus-0.92/ \
-        --dataset=vctk \
-        --dumpdir=dump \
-        --dur-file=durations.txt \
-        --config=${config_path} \
-        --cut-sil=True \
-        --num-cpu=20
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    # get features' stats(mean and std)
-    echo "Get features' stats ..."
-    python3 ${MAIN_ROOT}/utils/compute_statistics.py \
-        --metadata=dump/train/raw/metadata.jsonl \
-        --field-name="feats"
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-    # normalize, dev and test should use train's stats
-    echo "Normalize ..."
-   
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/train/raw/metadata.jsonl \
-        --dumpdir=dump/train/norm \
-        --stats=dump/train/feats_stats.npy
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/dev/raw/metadata.jsonl \
-        --dumpdir=dump/dev/norm \
-        --stats=dump/train/feats_stats.npy
-    
-    python3 ${BIN_DIR}/../normalize.py \
-        --metadata=dump/test/raw/metadata.jsonl \
-        --dumpdir=dump/test/norm \
-        --stats=dump/train/feats_stats.npy
-fi
diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh
new file mode 120000
index 000000000..f0cb24de9
--- /dev/null
+++ b/examples/vctk/voc5/local/preprocess.sh
@@ -0,0 +1 @@
+../../voc1/local/preprocess.sh
\ No newline at end of file
diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh
deleted file mode 100755
index 647896175..000000000
--- a/examples/vctk/voc5/local/synthesize.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-ckpt_name=$3
-
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
-    --config=${config_path} \
-    --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \
-    --test-metadata=dump/test/norm/metadata.jsonl \
-    --output-dir=${train_output_path}/test \
-    --generator-type=hifigan
diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh
new file mode 120000
index 000000000..c887112c0
--- /dev/null
+++ b/examples/vctk/voc5/local/synthesize.sh
@@ -0,0 +1 @@
+../../../csmsc/voc5/local/synthesize.sh
\ No newline at end of file
diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh
deleted file mode 100755
index 9695631ef..000000000
--- a/examples/vctk/voc5/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-FLAGS_cudnn_exhaustive_search=true \
-FLAGS_conv_workspace_size_limit=4000 \
-python ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=1
diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh
new file mode 120000
index 000000000..2942893d2
--- /dev/null
+++ b/examples/vctk/voc5/local/train.sh
@@ -0,0 +1 @@
+../../../csmsc/voc1/local/train.sh
\ No newline at end of file
diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh
deleted file mode 100755
index 7451b3218..000000000
--- a/examples/vctk/voc5/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=hifigan
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL}
diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh
new file mode 120000
index 000000000..b67fe2b39
--- /dev/null
+++ b/examples/vctk/voc5/path.sh
@@ -0,0 +1 @@
+../../csmsc/voc5/path.sh
\ No newline at end of file
diff --git a/examples/zh_en_tts/tts3/local/train.sh b/examples/zh_en_tts/tts3/local/train.sh
deleted file mode 100755
index 1da72f117..000000000
--- a/examples/zh_en_tts/tts3/local/train.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-
-config_path=$1
-train_output_path=$2
-
-python3 ${BIN_DIR}/train.py \
-    --train-metadata=dump/train/norm/metadata.jsonl \
-    --dev-metadata=dump/dev/norm/metadata.jsonl \
-    --config=${config_path} \
-    --output-dir=${train_output_path} \
-    --ngpu=2 \
-    --phones-dict=dump/phone_id_map.txt \
-    --speaker-dict=dump/speaker_id_map.txt
diff --git a/examples/zh_en_tts/tts3/local/train.sh b/examples/zh_en_tts/tts3/local/train.sh
new file mode 120000
index 000000000..78885a300
--- /dev/null
+++ b/examples/zh_en_tts/tts3/local/train.sh
@@ -0,0 +1 @@
+../../../aishell3/tts3/local/train.sh
\ No newline at end of file
diff --git a/examples/zh_en_tts/tts3/path.sh b/examples/zh_en_tts/tts3/path.sh
deleted file mode 100755
index fb7e8411c..000000000
--- a/examples/zh_en_tts/tts3/path.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-export MAIN_ROOT=`realpath ${PWD}/../../../`
-
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
-export LC_ALL=C
-
-export PYTHONDONTWRITEBYTECODE=1
-# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
-
-MODEL=fastspeech2
-export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}
diff --git a/examples/zh_en_tts/tts3/path.sh b/examples/zh_en_tts/tts3/path.sh
new file mode 120000
index 000000000..4785b9095
--- /dev/null
+++ b/examples/zh_en_tts/tts3/path.sh
@@ -0,0 +1 @@
+../../csmsc/tts3/path.sh
\ No newline at end of file
diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py
index ff822f674..bd76a13d0 100644
--- a/paddlespeech/cli/text/infer.py
+++ b/paddlespeech/cli/text/infer.py
@@ -127,7 +127,7 @@ class TextExecutor(BaseExecutor):
         if self.task == 'punc':
             # punc list
             self._punc_list = []
-            with open(self.vocab_file, 'r') as f:
+            with open(self.vocab_file, 'r', encoding='utf-8') as f:
                 for line in f:
                     self._punc_list.append(line.strip())
 
@@ -178,12 +178,12 @@ class TextExecutor(BaseExecutor):
         if self.task == 'punc':
             # punc list
             self._punc_list = []
-            with open(self.vocab_file, 'r') as f:
+            with open(self.vocab_file, 'r', encoding='utf-8') as f:
                 for line in f:
                     self._punc_list.append(line.strip())
 
             # model
-            with open(self.cfg_path) as f:
+            with open(self.cfg_path, 'r', encoding='utf-8') as f:
                 config = CfgNode(yaml.safe_load(f))
             self.model = ErnieLinear(**config["model"])
 
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index 707518c05..5515ade26 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -292,19 +292,19 @@ class TTSExecutor(BaseExecutor):
         with open(self.voc_config) as f:
             self.voc_config = CfgNode(yaml.safe_load(f))
 
-        with open(self.phones_dict, "r") as f:
+        with open(self.phones_dict, 'rt', encoding='utf-8') as f:
             phn_id = [line.strip().split() for line in f.readlines()]
         vocab_size = len(phn_id)
 
         tone_size = None
         if self.tones_dict:
-            with open(self.tones_dict, "r") as f:
+            with open(self.tones_dict, 'rt', encoding='utf-8') as f:
                 tone_id = [line.strip().split() for line in f.readlines()]
             tone_size = len(tone_id)
 
         spk_num = None
         if self.speaker_dict:
-            with open(self.speaker_dict, 'rt') as f:
+            with open(self.speaker_dict, 'rt', encoding='utf-8') as f:
                 spk_id = [line.strip().split() for line in f.readlines()]
             spk_num = len(spk_id)
 
diff --git a/paddlespeech/cli/whisper/infer.py b/paddlespeech/cli/whisper/infer.py
index c016b453a..ebcca890b 100644
--- a/paddlespeech/cli/whisper/infer.py
+++ b/paddlespeech/cli/whisper/infer.py
@@ -152,8 +152,7 @@ class WhisperExecutor(BaseExecutor):
         Init model and other resources from a specific path.
         """
         logger.debug("start to init the model")
-        # default max_len: unit:second
-        self.max_len = 50
+
         if hasattr(self, 'model'):
             logger.debug('Model had been initialized.')
             return
@@ -339,12 +338,6 @@ class WhisperExecutor(BaseExecutor):
         try:
             audio, audio_sample_rate = soundfile.read(
                 audio_file, dtype="int16", always_2d=True)
-            audio_duration = audio.shape[0] / audio_sample_rate
-            if audio_duration > self.max_len:
-                logger.error(
-                    f"Please input audio file less then {self.max_len} seconds.\n"
-                )
-                return False
         except Exception as e:
             logger.exception(e)
             logger.error(
diff --git a/paddlespeech/s2t/exps/wav2vec2/model.py b/paddlespeech/s2t/exps/wav2vec2/model.py
index 6a3321e46..86b56b876 100644
--- a/paddlespeech/s2t/exps/wav2vec2/model.py
+++ b/paddlespeech/s2t/exps/wav2vec2/model.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,17 +17,22 @@ import math
 import os
 import re
 import time
-from collections import defaultdict
 from collections import OrderedDict
 from contextlib import nullcontext
 
 import jsonlines
 import numpy as np
 import paddle
+from hyperpyyaml import load_hyperpyyaml
 from paddle import distributed as dist
+from paddlenlp.transformers import AutoTokenizer
 
 from paddlespeech.s2t.frontend.featurizer import TextFeaturizer
 from paddlespeech.s2t.io.dataloader import DataLoaderFactory
+from paddlespeech.s2t.io.speechbrain import data_pipeline
+from paddlespeech.s2t.io.speechbrain import dataio
+from paddlespeech.s2t.io.speechbrain import dataset
+from paddlespeech.s2t.io.speechbrain.dataloader import make_dataloader
 from paddlespeech.s2t.models.wav2vec2.processing.speech_augmentation import TimeDomainSpecAugment
 from paddlespeech.s2t.models.wav2vec2.wav2vec2_ASR import Wav2vec2ASR
 from paddlespeech.s2t.training.optimizer import OptimizerFactory
@@ -45,10 +50,96 @@ from paddlespeech.s2t.utils.utility import UpdateConfig
 logger = Log(__name__).getlog()
 
 
+def clip_grad_norm_(
+        parameters,
+        max_norm,
+        norm_type=2.0,
+        error_if_nonfinite=False, ):
+    r"""Clips gradient norm of the iterable parameters.
+
+    Norms are calculated together on all gradients, just as they are
+    connected into one vector. The gradient will be modified in place.
+
+    This API can only run in dynamic graph mode, not static graph mode.
+
+    Args:
+        parameters (Iterable[paddle.Tensor] or paddle.Tensor): Tensors or a single Tensor
+            whose gradients will be clipped. One-shot iterators are accepted.
+        max_norm (float or int): max norm of the gradients
+        norm_type (float or int): type of the used p-norm. Can be `inf` for
+            infinity norm.
+        error_if_nonfinite (bool): if True, throw an error if the total
+            norm of the gradients from :attr:`parameters` is `nan`,
+            `inf`, or `-inf`.
+
+    Returns:
+        Total norm of the parameter gradients (treated as a single vector).
+    Raises:
+        RuntimeError: if not in dynamic mode, or if the total norm is
+            non-finite while `error_if_nonfinite` is True.
+        ValueError: if `norm_type` is not one of `inf`, 0, 1, 2.
+    Example:
+        .. code-block:: python
+            import paddle
+            x = paddle.uniform([10, 10], min=-1.0, max=1.0, dtype='float32')
+            linear = paddle.nn.Linear(in_features=10, out_features=10)
+            out = linear(x)
+            loss = paddle.mean(out)
+            loss.backward()
+            clip_grad_norm_(linear.parameters(), max_norm=5.0)
+
+            sgd = paddle.optimizer.SGD(learning_rate=0.1, parameters=linear.parameters())
+            sgd.step()
+    """
+    if not paddle.in_dynamic_mode():
+        raise RuntimeError('this API can only run in dynamic mode.')
+
+    # Materialize once: a one-shot iterator must survive both passes below.
+    if isinstance(parameters, paddle.Tensor):
+        parameters = [parameters]
+    parameters = list(parameters)
+
+    support_norm_type = [float("inf"), 0, 1, 2]
+    if norm_type not in support_norm_type:
+        raise ValueError(f'norm_type only support {support_norm_type}')
+
+    grads = [p.grad for p in parameters if p.grad is not None]
+    max_norm = float(max_norm)
+    norm_type = float(norm_type)
+    if len(grads) == 0:
+        return paddle.to_tensor(0.0)
+    if norm_type == float("inf"):
+        norms = [g.detach().abs().max() for g in grads]
+        total_norm = (norms[0]
+                      if len(norms) == 1 else paddle.max(paddle.stack(norms)))
+    else:
+        norms = [paddle.linalg.norm(g.detach(), norm_type) for g in grads]
+        total_norm = paddle.linalg.norm(paddle.stack(norms), norm_type)
+
+    if error_if_nonfinite and paddle.logical_or(total_norm.isnan(),
+                                                total_norm.isinf()):
+        raise RuntimeError(
+            f'The total norm of {norm_type} order of the gradients from '
+            '`parameters` is non-finite, so it cannot be clipped. In any case, '
+            'disable this error and scale the gradient by non-finite norm, '
+            'set `error_if_nonfinite=False`')
+    clip_coef = max_norm / (total_norm + 1e-6)
+    # Note: when the coef is clamped to 1, it is redundant to multiply the clamped coef, but this
+    # avoids the `if clip_coef < 1:` condition.
+    clip_coef_clamped = paddle.clip(clip_coef, max=1.0)
+    with paddle.no_grad():
+        for p in parameters:
+            if p.grad is not None:
+                p.grad = paddle.multiply(x=p.grad, y=clip_coef_clamped)
+    return total_norm
+
+
 class Wav2Vec2ASRTrainer(Trainer):
     def __init__(self, config, args):
         super().__init__(config, args)
         self.avg_train_loss = 0.0
+        self.loss_isfinite = True  # set to False once a NaN or inf loss is seen; such a loss cannot be averaged
+        self.use_sb = True  # whether to use the SpeechBrain dataloader
 
     def update_average(self, batch_index, loss):
         """Update running average of the loss.
@@ -62,6 +153,9 @@ class Wav2Vec2ASRTrainer(Trainer):
         if math.isfinite(loss):
             self.avg_train_loss -= self.avg_train_loss / (batch_index + 1)
             self.avg_train_loss += loss / (batch_index + 1)
+        else:
+            self.loss_isfinite = False
+            logger.info('loss:{} in Nan or inf, error'.format(loss))
 
     def before_train(self):
         from_scratch = self.resume_or_scratch()
@@ -81,14 +175,22 @@ class Wav2Vec2ASRTrainer(Trainer):
         start = time.time()
 
         # forward
-        utt, wav, wavs_lens, target, target_lens = batch
-        wavs_lens_rate = wavs_lens / wav.shape[1]
+        ## sb data pipeline
+        if self.use_sb:
+            wav, wavs_lens_rate = batch['sig']
+            target, target_lens_rate = batch['tokens']
+            target_lens = (target_lens_rate *
+                           target.shape[1]).round().astype(paddle.int64)
+        else:
+            utt, wav, wavs_lens, target, target_lens = batch
+            wavs_lens_rate = wavs_lens / wav.shape[1]
+            wav = wav[:, :, 0]
 
-        wav = wav[:, :, 0]
         if hasattr(train_conf, 'audio_augment'):
             wav = self.speech_augmentation(wav, wavs_lens_rate)
 
         loss = self.model(wav, wavs_lens_rate, target, target_lens)
+
         # loss div by `batch_size * accum_grad`
         loss /= train_conf.accum_grad
         # update self.avg_train_loss
@@ -108,10 +210,15 @@ class Wav2Vec2ASRTrainer(Trainer):
             context = nullcontext
         with context():
             loss.backward()
+
             layer_tools.print_grads(self.model, print_func=None)
 
         # optimizer step old
         if (batch_index + 1) % train_conf.accum_grad == 0:
+            #do global grad clip
+            if train_conf.global_grad_clip != 0:
+                clip_grad_norm_(self.model.parameters(),
+                                train_conf.global_grad_clip)
             self.model_optimizer.step()
             self.model_optimizer.clear_grad()
             if not train_conf.freeze_wav2vec2:
@@ -123,10 +230,12 @@ class Wav2Vec2ASRTrainer(Trainer):
                 if not train_conf.freeze_wav2vec2:
                     self.wav2vec2_lr_scheduler.step()
             self.iteration += 1
+
         losses_np = {'loss': self.avg_train_loss * train_conf.accum_grad}
         iteration_time = time.time() - start
         for k, v in losses_np.items():
             report(k, v)
+        report("loss_whitoutavg", float(loss))
         report("batch_size", self.config.batch_size)
         report("accum", train_conf.accum_grad)
         report("step_cost", iteration_time)
@@ -148,24 +257,34 @@ class Wav2Vec2ASRTrainer(Trainer):
         if not self.use_streamdata:
             logger.info(
                 f"Valid Total Examples: {len(self.valid_loader.dataset)}")
-        valid_losses = defaultdict(list)
-        num_seen_utts = 1
+        valid_losses = {}
+        step = 0
         total_loss = 0.0
+        num_seen_utts = 1  # use update_average and no need for num_seen_utts here
         for i, batch in enumerate(self.valid_loader):
-            utt, wav, wavs_lens, target, target_lens = batch
-            wavs_lens_rate = wavs_lens / wav.shape[1]
-            wav = wav[:, :, 0]
+            if self.use_sb:
+                wav, wavs_lens_rate = batch['sig']
+                target, target_lens_rate = batch['tokens']
+                target_lens = (target_lens_rate *
+                               target.shape[1]).round().astype(paddle.int64)
+            else:
+                utt, wav, wavs_lens, target, target_lens = batch
+                wavs_lens_rate = wavs_lens / wav.shape[1]
+                wav = wav[:, :, 0]
+
             loss = self.model(wav, wavs_lens_rate, target, target_lens)
+            # use update_average
+            total_loss -= total_loss / (step + 1)
+            total_loss += loss / (step + 1)
 
             if math.isfinite(float(loss)):
-                num_utts = batch[1].shape[0]
-                num_seen_utts += num_utts
-                total_loss += float(loss) * num_utts
-                valid_losses['val_loss'].append(float(loss))
+                step += 1
+                valid_losses['val_loss'] = float(loss)
+            else:
+                logger.info('loss:{} in Nan or inf, error'.format(float(loss)))
 
             if (i + 1) % self.config.log_interval == 0:
-                valid_dump = {k: np.mean(v) for k, v in valid_losses.items()}
-                valid_dump['val_history_loss'] = total_loss / num_seen_utts
+                valid_losses['val_history_loss'] = float(total_loss)
 
                 # logging
                 msg = f"Valid: Rank: {dist.get_rank()}, "
@@ -175,11 +294,11 @@ class Wav2Vec2ASRTrainer(Trainer):
                     msg += "batch: {}/{}, ".format(i + 1,
                                                    len(self.valid_loader))
                 msg += ', '.join('{}: {:>.6f}'.format(k, v)
-                                 for k, v in valid_dump.items())
+                                 for k, v in valid_losses.items())
                 logger.info(msg)
 
-        logger.info('Rank {} Val info val_loss {}'.format(
-            dist.get_rank(), total_loss / num_seen_utts))
+        logger.info(
+            'Rank {} Val info val_loss {}'.format(dist.get_rank(), total_loss))
         return total_loss, num_seen_utts
 
     @mp_tools.rank_zero_only
@@ -228,7 +347,7 @@ class Wav2Vec2ASRTrainer(Trainer):
             logger.info("Saved scheduler state to {}".format(scheduler_path))
         info_path = re.sub('.pdparams$', '.json', params_path)
         infos = {} if infos is None else infos
-        with open(info_path, 'w') as fout:
+        with open(info_path, 'w', encoding='utf8') as fout:
             data = json.dumps(infos)
             fout.write(data)
 
@@ -245,7 +364,7 @@ class Wav2Vec2ASRTrainer(Trainer):
             # lr will resotre from optimizer ckpt
             resume_json_path = os.path.join(self.checkpoint_dir,
                                             self.args.resume + '.json')
-            with open(resume_json_path, 'r') as f:
+            with open(resume_json_path, 'r', encoding='utf8') as f:
                 resume_json = json.load(f)
             self.iteration = 0
             self.epoch = resume_json["epoch"]
@@ -340,14 +459,13 @@ class Wav2Vec2ASRTrainer(Trainer):
                 total_loss, num_seen_utts = self.valid()
                 if dist.get_world_size() > 1:
                     num_seen_utts = paddle.to_tensor(num_seen_utts)
-                    # the default operator in all_reduce function is sum.
                     dist.all_reduce(num_seen_utts)
                     total_loss = paddle.to_tensor(total_loss)
                     dist.all_reduce(total_loss)
                     cv_loss = total_loss / num_seen_utts
                     cv_loss = float(cv_loss)
                 else:
-                    cv_loss = total_loss / num_seen_utts
+                    cv_loss = float(total_loss)
             logger.info(
                 'Epoch {} Val info val_loss {}'.format(self.epoch, cv_loss))
             if self.visualizer:
@@ -368,45 +486,182 @@ class Wav2Vec2ASRTrainer(Trainer):
                 if not self.config.freeze_wav2vec2:
                     self.wav2vec2_lr_scheduler.step(cv_loss)
             self.save(tag=self.epoch, infos={'val_loss': cv_loss})
+            self.avg_train_loss = 0.0
             self.new_epoch()
 
+    def dataio_prepare(self, hparams):
+        """This function prepares the datasets to be used in the brain class.
+        It also defines the data processing pipeline through user-defined functions."""
+        data_folder = hparams["data_folder"]
+
+        train_data = dataset.DynamicItemDataset.from_csv(
+            csv_path=hparams["train_data"],
+            replacements={"data_root": data_folder}, )
+
+        if hparams["sorting"] == "ascending":
+            # we sort training data to speed up training and get better results.
+            train_data = train_data.filtered_sorted(sort_key="duration")
+            # When sorting, do not shuffle in the dataloader; otherwise sorting is pointless.
+            hparams["train_dataloader_opts"]["shuffle"] = False
+
+        elif hparams["sorting"] == "descending":
+            train_data = train_data.filtered_sorted(
+                sort_key="duration", reverse=True)
+            # When sorting, do not shuffle in the dataloader; otherwise sorting is pointless.
+            hparams["train_dataloader_opts"]["shuffle"] = False
+
+        elif hparams["sorting"] == "random":
+            pass
+
+        else:
+            raise NotImplementedError(
+                "sorting must be random, ascending or descending")
+
+        valid_data = dataset.DynamicItemDataset.from_csv(
+            csv_path=hparams["valid_data"],
+            replacements={"data_root": data_folder}, )
+        valid_data = valid_data.filtered_sorted(sort_key="duration")
+
+        test_data = dataset.DynamicItemDataset.from_csv(
+            csv_path=hparams["test_data"],
+            replacements={"data_root": data_folder}, )
+        test_data = test_data.filtered_sorted(sort_key="duration")
+
+        datasets = [train_data, valid_data, test_data]
+
+        # 1. Define the tokenizer and load it:
+        tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
+        self.tokenizer = tokenizer
+        # 2. Define audio pipeline:
+        @data_pipeline.takes("wav")
+        @data_pipeline.provides("sig")
+        def audio_pipeline(wav):
+            sig = dataio.read_audio(wav)
+            return sig
+
+        dataset.add_dynamic_item(datasets, audio_pipeline)
+
+        # 3. Define text pipeline:
+        @data_pipeline.takes("transcript")
+        @data_pipeline.provides("wrd", "tokens_list", "tokens")
+        def text_pipeline(wrd):
+            wrd = "".join(wrd.split(" "))
+            yield wrd
+            tokens_list = tokenizer(wrd)["input_ids"]
+            yield tokens_list
+            tokens = np.array(tokens_list, dtype="int64")
+            # tokens = paddle.to_tensor(tokens_list, dtype="int64")
+            yield tokens
+
+        dataset.add_dynamic_item(datasets, text_pipeline)
+
+        # 4. Set output:
+        dataset.set_output_keys(
+            datasets,
+            ["id", "sig", "wrd", "tokens"], )
+
+        # 5. If Dynamic Batching is used, we instantiate the needed samplers.
+        train_batch_sampler = None
+        valid_batch_sampler = None
+        if hparams["dynamic_batching"]:
+            from sampler import DynamicBatchSampler  # noqa
+
+            dynamic_hparams = hparams["dynamic_batch_sampler"]
+            num_buckets = dynamic_hparams["num_buckets"]
+
+            train_batch_sampler = DynamicBatchSampler(
+                train_data,
+                dynamic_hparams["max_batch_len"],
+                num_buckets=num_buckets,
+                length_func=lambda x: x["duration"],
+                shuffle=dynamic_hparams["shuffle_ex"],
+                batch_ordering=dynamic_hparams["batch_ordering"], )
+
+            valid_batch_sampler = DynamicBatchSampler(
+                valid_data,
+                dynamic_hparams["max_batch_len"],
+                num_buckets=num_buckets,
+                length_func=lambda x: x["duration"],
+                shuffle=dynamic_hparams["shuffle_ex"],
+                batch_ordering=dynamic_hparams["batch_ordering"], )
+
+        return (train_data, valid_data, test_data, tokenizer,
+                train_batch_sampler, valid_batch_sampler, )
+
     def setup_dataloader(self):
         config = self.config.clone()
         self.use_streamdata = config.get("use_stream_data", False)
-        if self.train:
-            self.train_loader = DataLoaderFactory.get_dataloader(
-                'train', config, self.args)
-            self.valid_loader = DataLoaderFactory.get_dataloader(
-                'valid', config, self.args)
-            logger.info("Setup train/valid Dataloader!")
+        self.use_sb = config.use_sb_pipeline
+        if self.use_sb:
+            hparams_file = config.sb_pipeline_conf
+            with open(hparams_file, 'r', encoding='utf8') as fin:
+                hparams = load_hyperpyyaml(fin, None)
+
+            (train_data, valid_data, test_data, tokenizer, train_bsampler,
+             valid_bsampler, ) = self.dataio_prepare(hparams)
+
+            train_dataloader_opts = hparams["train_dataloader_opts"]
+            valid_dataloader_opts = hparams["valid_dataloader_opts"]
+
+            if train_bsampler is not None:
+                train_dataloader_opts = {
+                    "batch_sampler": train_bsampler,
+                    "num_workers": hparams["num_workers"],
+                }
+
+            if valid_bsampler is not None:
+                valid_dataloader_opts = {"batch_sampler": valid_bsampler}
+
+            if self.train:
+                self.train_loader = make_dataloader(
+                    train_data, stage='train', **train_dataloader_opts)
+                self.valid_loader = make_dataloader(
+                    valid_data,
+                    stage='val',
+                    **valid_dataloader_opts, )
+                logger.info("Setup train/valid Dataloader!")
+            else:
+                self.test_loader = make_dataloader(
+                    test_data, stage='test', **hparams["test_dataloader_opts"])
         else:
-            decode_batch_size = config.get('decode', dict()).get(
-                'decode_batch_size', 1)
-            self.test_loader = DataLoaderFactory.get_dataloader('test', config,
-                                                                self.args)
-            self.align_loader = DataLoaderFactory.get_dataloader(
-                'align', config, self.args)
-            logger.info("Setup test/align Dataloader!")
+            if self.train:
+                self.train_loader = DataLoaderFactory.get_dataloader(
+                    'train', config, self.args)
+                self.valid_loader = DataLoaderFactory.get_dataloader(
+                    'valid', config, self.args)
+                logger.info("Setup train/valid Dataloader!")
+            else:
+                decode_batch_size = config.get('decode', dict()).get(
+                    'decode_batch_size', 1)
+                self.test_loader = DataLoaderFactory.get_dataloader(
+                    'test', config, self.args)
+                self.align_loader = DataLoaderFactory.get_dataloader(
+                    'align', config, self.args)
+                logger.info("Setup test/align Dataloader!")
 
     def setup_model(self):
         config = self.config
         model_conf = config
 
         with UpdateConfig(model_conf):
-            if self.train:
-                model_conf.input_dim = self.train_loader.feat_dim
-                model_conf.output_dim = self.train_loader.vocab_size
+            if self.use_sb:
+                model_conf.output_dim = self.tokenizer.vocab_size
             else:
-                model_conf.input_dim = self.test_loader.feat_dim
-                model_conf.output_dim = self.test_loader.vocab_size
+                if self.train:
+                    model_conf.input_dim = self.train_loader.feat_dim
+                    model_conf.output_dim = self.train_loader.vocab_size
+                else:
+                    model_conf.input_dim = self.test_loader.feat_dim
+                    model_conf.output_dim = self.test_loader.vocab_size
 
         model = Wav2vec2ASR.from_config(model_conf)
+
         model_dict = paddle.load(config.wav2vec2_params_path)
         model.wav2vec2.set_state_dict(model_dict)
 
         if self.parallel:
             model = paddle.DataParallel(model, find_unused_parameters=True)
-        logger.info(f"{model}")
+
         layer_tools.print_params(model, logger.info)
         self.model = model
         logger.info("Setup model!")
@@ -422,8 +677,11 @@ class Wav2Vec2ASRTrainer(Trainer):
         train_config = config
         model_optim_type = train_config.model_optim
         model_optim_conf = train_config.model_optim_conf
-        wav2vec2_optim_type = train_config.model_optim
+        logger.info("optim_model:{},{}", model_optim_type, model_optim_conf)
+        wav2vec2_optim_type = train_config.wav2vec2_optim
         wav2vec2_optim_conf = train_config.wav2vec2_optim_conf
+        logger.info("optim_model:{},{}", wav2vec2_optim_type,
+                    wav2vec2_optim_conf)
 
         model_scheduler_type = train_config.model_scheduler
         model_scheduler_conf = train_config.model_scheduler_conf
@@ -449,11 +707,8 @@ class Wav2Vec2ASRTrainer(Trainer):
                 optim_conf,
                 parameters,
                 lr_scheduler=None, ):
-            train_config = config
             optim_arg = dict(optim_conf)
             optim_arg.update({
-                "grad_clip":
-                train_config.global_grad_clip,
                 "learning_rate":
                 lr_scheduler if lr_scheduler else optim_conf.lr,
                 "parameters":
@@ -475,10 +730,12 @@ class Wav2Vec2ASRTrainer(Trainer):
                                                   'params':
                                                   model.ctc.parameters()
                                               }], model_lr_scheduler)
+
         wav2vec2_optimizer_args = optimizer_args(
             config, wav2vec2_optim_type, wav2vec2_optim_conf,
             model._layers.wav2vec2.parameters() if self.parallel else
             model.wav2vec2.parameters(), wav2vec2_lr_scheduler)
+
         model_optimizer = OptimizerFactory.from_args(model_optim_type,
                                                      model_optimizer_args)
         wav2vec2_optimizer = OptimizerFactory.from_args(wav2vec2_optim_type,
@@ -507,12 +764,7 @@ class Wav2Vec2ASRTester(Wav2Vec2ASRTrainer):
             trans.append(self.text_featurizer.defeaturize(ids.numpy().tolist()))
         return trans
 
-    def compute_metrics(self,
-                        utts,
-                        audio,
-                        audio_len,
-                        texts,
-                        texts_len,
+    def compute_metrics(self, id, audio, audio_len, texts, texts_len,
                         fout=None):
         decode_cfg = self.config.decode
         errors_sum, len_refs, num_ins = 0.0, 0, 0
@@ -529,7 +781,7 @@ class Wav2Vec2ASRTester(Wav2Vec2ASRTrainer):
         decode_time = time.time() - start_time
 
         for utt, target, result, rec_tids in zip(
-                utts, target_transcripts, result_transcripts, result_tokenids):
+                id, target_transcripts, result_transcripts, result_tokenids):
             errors, len_ref = errors_func(target, result)
             errors_sum += errors
             len_refs += len_ref
@@ -556,6 +808,49 @@ class Wav2Vec2ASRTester(Wav2Vec2ASRTrainer):
             num_frames=audio_len.sum().numpy().item(),
             decode_time=decode_time)
 
+    def sb_compute_metrics(self, id, sig, wrd, tokens, fout=None):
+        decode_cfg = self.config.decode
+        errors_sum, len_refs, num_ins = 0.0, 0, 0
+        errors_func = error_rate.char_errors if decode_cfg.error_rate_type == 'cer' else error_rate.word_errors
+        error_rate_func = error_rate.cer if decode_cfg.error_rate_type == 'cer' else error_rate.wer
+        start_time = time.time()
+        target_transcripts = wrd
+        result_transcripts, result_tokenids = self.model.decode(
+            sig[0],
+            text_feature=self.tokenizer,
+            decoding_method=decode_cfg.decoding_method,
+            beam_size=decode_cfg.beam_size,
+            sb_pipeline=True)
+        decode_time = time.time() - start_time
+
+        for utt, target, result, rec_tids in zip(
+                id, target_transcripts, result_transcripts, result_tokenids):
+            errors, len_ref = errors_func(target, result)
+            errors_sum += errors
+            len_refs += len_ref
+            num_ins += 1
+            if fout:
+                fout.write({
+                    "utt": utt,
+                    "refs": [target],
+                    "hyps": [result],
+                    "hyps_tokenid": [rec_tids],
+                })
+            logger.info(f"Utt: {utt}")
+            logger.info(f"Ref: {target}")
+            logger.info(f"Hyp: {result}")
+            logger.info("One example error rate [%s] = %f" % (
+                decode_cfg.error_rate_type, error_rate_func(target, result)))
+
+        return dict(
+            errors_sum=errors_sum,
+            len_refs=len_refs,
+            num_ins=num_ins,  # num examples
+            error_rate=errors_sum / len_refs,
+            error_rate_type=decode_cfg.error_rate_type,
+            num_frames=sig[1].sum().numpy().item(),
+            decode_time=decode_time)
+
     @mp_tools.rank_zero_only
     @paddle.no_grad()
     def test(self):
@@ -573,7 +868,10 @@ class Wav2Vec2ASRTester(Wav2Vec2ASRTrainer):
 
         with jsonlines.open(self.args.result_file, 'w') as fout:
             for i, batch in enumerate(self.test_loader):
-                metrics = self.compute_metrics(*batch, fout=fout)
+                if self.use_sb:
+                    metrics = self.sb_compute_metrics(**batch, fout=fout)
+                else:
+                    metrics = self.compute_metrics(*batch, fout=fout)
                 num_frames += metrics['num_frames']
                 num_time += metrics["decode_time"]
                 errors_sum += metrics['errors_sum']
@@ -595,7 +893,7 @@ class Wav2Vec2ASRTester(Wav2Vec2ASRTrainer):
 
         err_meta_path = os.path.splitext(self.args.result_file)[0] + '.err'
         err_type_str = "{}".format(error_rate_type)
-        with open(err_meta_path, 'w') as f:
+        with open(err_meta_path, 'w', encoding='utf8') as f:
             data = json.dumps({
                 "epoch":
                 self.epoch,
diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py
index 5ba891c39..db6292f2c 100644
--- a/paddlespeech/s2t/io/dataloader.py
+++ b/paddlespeech/s2t/io/dataloader.py
@@ -464,5 +464,5 @@ class DataLoaderFactory():
                 subsampling_factor=config.subsampling_factor,
                 load_aux_output=config.get('load_transcript', None),
                 num_encs=config.num_encs,
-                dist_sampler=config.dist_sampler,
+                dist_sampler=config.get('dist_sampler', None),
                 shortest_first=config.shortest_first)
diff --git a/paddlespeech/s2t/io/speechbrain/__init__.py b/paddlespeech/s2t/io/speechbrain/__init__.py
new file mode 100644
index 000000000..595add0ae
--- /dev/null
+++ b/paddlespeech/s2t/io/speechbrain/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/s2t/io/speechbrain/batch.py b/paddlespeech/s2t/io/speechbrain/batch.py
new file mode 100755
index 000000000..73f13181e
--- /dev/null
+++ b/paddlespeech/s2t/io/speechbrain/batch.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# Modified from speechbrain 2023 (https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/dataio/batch.py)
+"""Batch collation
+
+Authors
+  * Aku Rouhe 2020
+"""
+import collections
+
+import paddle
+
+from paddlespeech.s2t.io.speechbrain.data_utils import batch_pad_right
+from paddlespeech.s2t.io.speechbrain.data_utils import mod_default_collate
+
+PaddedData = collections.namedtuple("PaddedData", ["data", "lengths"])
+
+
+class PaddedBatch:
+    """Collate_fn when examples are dicts and have variable-length sequences.
+
+    Different elements in the examples get matched by key.
+    All numpy arrays get converted to paddle.Tensor.
+    Then, by default, all paddle.Tensor valued elements get padded.
+    NOTE: this class does not itself define pin_memory() or to() methods.
+    Regular Python data types are just collected in a list.
+
+    Arguments
+    ---------
+    examples : list
+        List of example dicts, as produced by Dataloader.
+    padded_keys : list, None
+        (Optional) List of keys to pad on. If None, pad all paddle.Tensors
+    device_prep_keys : list, None
+        (Optional) Only these keys participate in collective memory pinning and moving with
+        to().
+        If None, defaults to all items with paddle.Tensor values.
+    padding_func : callable, optional
+        Called with a list of tensors to be padded together. Needs to return
+        two tensors: the padded data, and another tensor for the data lengths.
+    padding_kwargs : dict
+        (Optional) Extra kwargs to pass to padding_func. E.G. mode, value
+    nonpadded_stack : bool
+        Whether to apply Tensor stacking on values that didn't get padded.
+        This stacks if it can, but doesn't error out if it cannot.
+        Default: True, usually does the right thing.
+    """
+
+    def __init__(
+            self,
+            examples,
+            padded_keys=None,
+            device_prep_keys=None,
+            padding_func=batch_pad_right,
+            padding_kwargs={},
+            nonpadded_stack=True, ):
+        self.__length = len(examples)
+        self.__keys = list(examples[0].keys())
+        self.__padded_keys = []
+        self.__device_prep_keys = []
+        for key in self.__keys:
+            values = [example[key] for example in examples]
+            # Default convert usually does the right thing (numpy2tensor etc.)
+            values = paddle.to_tensor(values)
+
+            if (padded_keys is not None and key in padded_keys) or (
+                    padded_keys is None and
+                    isinstance(values[0], paddle.Tensor)):
+                # Padding and PaddedData
+                self.__padded_keys.append(key)
+                padded = PaddedData(*padding_func(values, **padding_kwargs))
+                setattr(self, key, padded)
+            else:
+                if nonpadded_stack:
+                    values = mod_default_collate(values)
+                setattr(self, key, values)
+            if (device_prep_keys is not None and key in device_prep_keys) or (
+                    device_prep_keys is None and
+                    isinstance(values[0], paddle.Tensor)):
+                self.__device_prep_keys.append(key)
+
+    def __len__(self):
+        return self.__length
+
+    def __getitem__(self, key):
+        if key in self.__keys:
+            return getattr(self, key)
+        else:
+            raise KeyError(f"Batch doesn't have key: {key}")
+
+    def __iter__(self):
+        """Iterates over the different elements of the batch.
+        """
+        return iter((getattr(self, key) for key in self.__keys))
diff --git a/paddlespeech/s2t/io/speechbrain/data_pipeline.py b/paddlespeech/s2t/io/speechbrain/data_pipeline.py
new file mode 100755
index 000000000..1bfe0e1d3
--- /dev/null
+++ b/paddlespeech/s2t/io/speechbrain/data_pipeline.py
@@ -0,0 +1,488 @@
+# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# Modified from speechbrain 2023 (https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/utils/data_pipeline.py)
+"""A pipeline for data transformations.
+
+Author:
+    * Aku Rouhe
+"""
+import inspect
+from dataclasses import dataclass
+
+from paddlespeech.s2t.io.speechbrain.depgraph import DependencyGraph
+
+
+@dataclass
+class StaticItem:
+    """Data class that represents a static item.
+
+    Static items are in-memory items so they don't need to be computed
+    dynamically.
+    """
+
+    key: str
+
+
+class DynamicItem:
+    """Essentially represents a data transformation function.
+
+    A DynamicItem takes some arguments and computes its value dynamically when
+    called. A straight-forward use-case is to load something from disk
+    dynamically; take the path and provide the loaded data.
+
+    Instances of this class are often created implicitly via the
+    @takes and @provides decorators or otherwise from specifying the taken and
+    provided arguments and the function.
+
+    A counterpart is the GeneratorDynamicItem, which should be used for
+    generator functions.
+
+    Arguments
+    ---------
+    takes : list
+        The keys of the items that this needs to compute its output.
+    func : callable
+        The function that is used to compute the output.
+    provides : list
+        The keys that this provides.
+    """
+
+    def __init__(self, takes=[], func=None, provides=[]):
+        self.takes = takes
+        self.func = func
+        self.provides = provides
+
+    def __call__(self, *args):
+        return self.func(*args)
+
+    # The next methods are more about supporting GeneratorDynamicItems
+    def next_takes(self):
+        """The next argkeys to provide to this, when called."""
+        # Regular function DynamicItems always just need the same set of args
+        return self.takes
+
+    def next_provides(self):
+        """The next keys that this provides, when called."""
+        # Regular function DynamicItems always just provide the same set of keys
+        return self.provides
+
+    def provided_in_order(self):
+        """Assuming that this may need to be called multiple times; which keys
+        does it provide at that call. Returns a list, with len equal to the
+        number of times that this may be called."""
+        # Regular function DynamicItems are only called once:
+        return [self.provides]
+
+    def reset(self):
+        """Signals that this will not be called any more times on this pipeline
+        call."""
+        # Regular function DynamicItems don't need special resets.
+        pass
+
+
+class GeneratorDynamicItem(DynamicItem):
+    """Essentially represents a multi-step data transformation.
+
+    This is the generator function counterpart for DynamicItem (which should be
+    used for regular functions).
+
+    A GeneratorDynamicItem first takes some arguments and then uses those in
+    multiple steps to incrementally compute some values when called.
+
+    A typical use-case is a pipeline of transformations on data: e.g. taking in
+    text as a string, providing a tokenized version on the first call, and then
+    providing an integer-encoded version on the second call. This can be used
+    even though the integer-encoder needs to be trained on the first outputs.
+
+    The main benefit is to be able to define the pipeline in a clear function,
+    even if parts of the pipeline depend on others for their initialization.
+
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Doesn't generate electricity, only stores the currently active
+        # generator:
+        self.current_generator = None
+        self.num_provided_items = 0
+
+    def __call__(self, *args):
+        if self.num_provided_items == len(self.provides):
+            raise RuntimeError("DynamicItemPipeline called too many times!")
+        if not self.current_generator:
+            self.current_generator = self.func(*args)
+        # NOTE: Not supporting sending new values to the pipeline.
+        out = next(self.current_generator)
+        self.num_provided_items += 1
+        return out
+
+    def next_takes(self):
+        """The next argkeys to provide to this, when called."""
+        if not self.current_generator:
+            return self.takes
+        else:
+            return []
+
+    def next_provides(self):
+        """The next keys that this provides, when called."""
+        keys = self.provides[self.num_provided_items]
+        # Support multiple yielded values like:
+        # @provides("wav_read", ["left_ch", "right_ch"])
+        if isinstance(keys, str):
+            return [keys]
+        else:
+            return keys
+
+    def provided_in_order(self):
+        """Assuming that this may need to be called multiple times; which keys
+        does it provide at that call. Returns a list, with len equal to the
+        number of times that this may be called."""
+        in_order = []
+        for keys in self.provides:
+            # Support multiple yielded values like:
+            # @provides("wav_read", ["left_ch", "right_ch"])
+            if isinstance(keys, str):
+                in_order.append([keys])
+            else:
+                in_order.append(keys)
+        return in_order
+
+    def reset(self):
+        """Signals that this will not be called any more times on this pipeline
+        call."""
+        if self.current_generator is not None:
+            self.current_generator.close()
+        self.current_generator = None
+        self.num_provided_items = 0
+
+
+def takes(*argkeys):
+    """Decorator which makes a DynamicItem and specifies its argkeys.
+
+    If the wrapped object is a generator function (has a yield statement),
+    creates a GeneratorDynamicItem. If the object is already a DynamicItem,
+    just specifies the argkeys for that. Otherwise creates a new regular
+    DynamicItem, with argkeys specified.
+
+    The args are always passed to the function at the start. Generators could
+    support sending new arguments, but for such use cases, simply create a new
+    dynamic item. The GeneratorDynamicItem class is meant for pipelines which
+    take in an input and transform it in multiple ways, where the intermediate
+    representations may be needed for e.g. fitting a BPE segmenter.
+
+    Example
+    -------
+    >>> @takes("text")
+    ... def tokenize(text):
+    ...     return text.strip().lower().split()
+    >>> tokenize.provides = ["tokenized"]
+    >>> tokenize('\tThis Example gets tokenized')
+    ['this', 'example', 'gets', 'tokenized']
+    """
+
+    def decorator(obj):
+        """Decorator definition."""
+        if isinstance(obj, DynamicItem):
+            if obj.takes:
+                raise ValueError("Can't overwrite DynamicItem.takes")
+            obj.takes = argkeys
+            return obj
+        elif inspect.isgeneratorfunction(obj):
+            return GeneratorDynamicItem(takes=argkeys, func=obj)
+        else:
+            return DynamicItem(takes=argkeys, func=obj)
+
+    return decorator
+
+
+takes_decorator = takes  # Just for DataPipeline.add_dynamic_item
+
+
+def provides(*output_keys):
+    """Decorator which makes a DynamicItem and specifies what keys it provides.
+
+    If the wrapped object is a generator function (has a yield statement),
+    creates a GeneratorDynamicItem. If the object is already a DynamicItem,
+    just specifies the provided keys for that. Otherwise creates a new regular
+    DynamicItem, with provided keys specified.
+
+    NOTE
+    ----
+    The behavior is slightly different for generators and regular functions, if
+    many output keys are specified, e.g. @provides("signal", "mfcc"). Regular
+    functions should return a tuple with len equal to len(output_keys), while
+    generators should yield the items one by one.
+
+    >>> @provides("signal", "feat")
+    ... def read_feat():
+    ...     wav = [.1,.2,-.1]
+    ...     feat = [s**2 for s in wav]
+    ...     return wav, feat
+    >>> @provides("signal", "feat")
+    ... def read_feat():
+    ...     wav = [.1,.2,-.1]
+    ...     yield wav
+    ...     feat = [s**2 for s in wav]
+    ...     yield feat
+
+    If multiple keys are yielded at once, write e.g.,
+
+    >>> @provides("wav_read", ["left_channel", "right_channel"])
+    ... def read_multi_channel():
+    ...     wav = [[.1,.2,-.1],[.2,.1,-.1]]
+    ...     yield wav
+    ...     yield wav[0], wav[1]
+
+    """
+
+    def decorator(obj):
+        """Decorator definition."""
+        if isinstance(obj, DynamicItem):
+            if obj.provides:
+                raise ValueError("Can't overwrite DynamicItem provides-list.")
+            obj.provides = output_keys
+            return obj
+        elif inspect.isgeneratorfunction(obj):
+            return GeneratorDynamicItem(func=obj, provides=output_keys)
+        else:
+            return DynamicItem(func=obj, provides=output_keys)
+
+    return decorator
+
+
+provides_decorator = provides  # Just for DataPipeline.add_dynamic_item
+
+
+class DataPipeline:
+    """Organises data transformations into a pipeline.
+
+    Example
+    -------
+    >>> pipeline = DataPipeline(
+    ...     static_data_keys=["text"],
+    ...     dynamic_items=[
+    ...     {"func": lambda x: x.lower(), "takes": "text", "provides": "foo"},
+    ...     {"func": lambda x: x[::-1], "takes": "foo", "provides": "bar"},
+    ...     ],
+    ...     output_keys=["bar"],
+    ... )
+    >>> pipeline({"text": "Test"})
+    {'bar': 'tset'}
+    """
+
+    def __init__(self, static_data_keys, dynamic_items=[], output_keys=[]):
+        self.dg = DependencyGraph()
+        self._exec_order = None
+        self.key_to_node = {}
+        self.unaccounted_keys = {}
+        self.dynamic_items = []
+        self.output_mapping = {}
+        self.add_static_keys(static_data_keys)
+        self.add_dynamic_items(dynamic_items)
+        self.set_output_keys(output_keys)
+
+    def add_static_keys(self, static_keys):
+        """Informs the pipeline about static items.
+
+        Static items are the ones provided to __call__ as data.
+        """
+        for key in static_keys:
+            node_id = self.dg.add_node(data=StaticItem(key=key))
+            self.key_to_node[key] = node_id
+
+    def add_dynamic_items(self, dynamic_items):
+        """Adds DynamicItem objects and/or kwargs dicts for add_dynamic_item."""
+        for item in dynamic_items:
+            if isinstance(item, DynamicItem):
+                self.add_dynamic_item(item)
+            else:
+                self.add_dynamic_item(**item)
+
+    def add_dynamic_item(self, func, takes=None, provides=None):
+        """Adds a dynamic item to the Pipeline.
+
+        Two calling conventions. For DynamicItem objects, just use:
+        add_dynamic_item(dynamic_item)
+        But otherwise, should use:
+        add_dynamic_item(func, takes, provides)
+
+        Arguments
+        ---------
+        func : callable, DynamicItem
+            If a DynamicItem is given, adds that directly. Otherwise a
+            DynamicItem is created, and this specifies the callable to use. If
+            a generator function is given, then create a GeneratorDynamicItem.
+            Otherwise creates a normal DynamicItem.
+        takes : list, str
+            List of keys. When func is called, each key is resolved to
+            either an entry in the data or the output of another dynamic_item.
+            The func is then called with these as positional arguments,
+            in the same order as specified here.
+            A single key can be given as a bare string.
+        provides : str, list
+            For regular functions, the key or list of keys that it provides.
+            If you give a generator function, key or list of keys that it
+            yields, in order. Also see the provides decorator.
+            A single key can be given as a bare string.
+        """
+        if isinstance(func, DynamicItem):
+            if takes is not None or provides is not None:
+                raise ValueError("If providing a DynamicItem directly, don't "
+                                 "specify takes or provides")
+            else:
+                self._add_dynamic_item_object(func)
+                return
+        if isinstance(takes, str):
+            takes = [takes]
+        if isinstance(provides, str):
+            provides = [provides]
+        di = takes_decorator(*takes)(provides_decorator(*provides)(func))
+        self._add_dynamic_item_object(di)
+
+    def _add_dynamic_item_object(self, obj):
+        """Internally adds the object.
+
+        There is a node in the dependency graph for each call of the
+        DynamicItem. Each call may return multiple keys and depend on multiple
+        keys. An internal dict maps key to the id of the node that produces it.
+        """
+        if not obj.provides:
+            raise ValueError("Won't add redundant dynamic item which doesn't "
+                             "provide anything.")
+        depended = []
+        for key in obj.takes:
+            # Might not be accounted for, yet:
+            if key not in self.key_to_node:
+                dependee_keys = self.unaccounted_keys.setdefault(key, [])
+                dependee_keys.extend(obj.next_provides())
+            else:
+                depended.append(self.key_to_node[key])
+        for provided in obj.provided_in_order():
+            node_id = self.dg.add_node(data=obj)
+            for key in provided:
+                self.key_to_node[key] = node_id
+                # This key may also be unaccounted for, so account for it now:
+                if key in self.unaccounted_keys:
+                    for dependee_key in self.unaccounted_keys[key]:
+                        dependee_node = self.key_to_node[dependee_key]
+                        self.dg.add_edge(dependee_node, node_id)
+                    del self.unaccounted_keys[key]  # Now accounted for!
+            for dep_id in depended:
+                self.dg.add_edge(node_id, dep_id)
+            # Next call will depend on this call:
+            depended = [node_id]
+        # Keep a reference to the item in this object, as well:
+        self.dynamic_items.append(obj)
+
+    def set_output_keys(self, keys):
+        """Use this to change the output keys.
+
+        Also re-evaluates execution order.
+        So if you request different outputs, some parts of the
+        data pipeline may be skipped.
+
+        Arguments
+        ---------
+        keys : dict, list, None
+            List of keys (str) to produce in output.
+
+            If a dict is given; it is used to map internal keys to output keys.
+            From the output_keys dict key:value pairs the key appears outside,
+            and value is the internal key.
+        """
+        self.output_mapping = self._output_keys_to_mapping(keys)
+        self._exec_order = None
+
+    @staticmethod
+    def _output_keys_to_mapping(keys):
+        """Normalises None / dict / iterable of keys to an output mapping."""
+        if keys is None:
+            return {}
+        if isinstance(keys, dict):
+            return keys
+        return {key: key for key in keys}
+
+    def compute_outputs(self, data):
+        """Computes the outputs selected with set_output_keys.
+
+        Arguments
+        ---------
+        data : dict
+            Dictionary with data entries by key.
+
+        Returns
+        -------
+        dict
+            With the keys that were set.
+        """
+        if self._exec_order is None:
+            self._prepare_run(data)
+        return self._compute(data, self._exec_order, self.output_mapping)
+
+    def compute_specific(self, keys, data):
+        """Compute output of specific item, without changing output_keys."""
+        output_mapping = self._output_keys_to_mapping(keys)
+        # Graph nodes are looked up by internal key, i.e. the mapping values.
+        node_ids = self.get_selected_node_ids(output_mapping.values())
+        order = self.dg.get_evaluation_order(selected_keys=node_ids)
+        return self._compute(data, order, output_mapping)
+
+    def _compute(self, data, order, output_mapping):
+        """Runs the items in `order` on data and collects the mapped outputs."""
+        if self.unaccounted_keys:
+            MSG = "These keys are still unaccounted for in the data pipeline: "
+            MSG += ", ".join(self.unaccounted_keys)
+            raise RuntimeError(MSG)
+        intermediate = {}
+        for node_id, edges, item in order:
+            if isinstance(item, StaticItem):
+                # Static items come straight from data: only check presence.
+                try:
+                    data[item.key]
+                    continue
+                except KeyError:
+                    raise KeyError(f"Expected key {item.key} in data!")
+            # A dynamic item, which we should compute:
+            args = [
+                data[argkey] if argkey in data else intermediate[argkey]
+                for argkey in item.next_takes()
+            ]
+            # This needs to be called BEFORE the dynamic item is called.
+            provided_keys = item.next_provides()
+            values = item(*args)  # Call the DynamicItem to produce output
+            # A single output is wrapped so it can be zipped with its key:
+            if len(provided_keys) == 1:
+                values = [values]
+            intermediate.update(zip(provided_keys, values))
+        for dynamic_item in self.dynamic_items:
+            dynamic_item.reset()
+        return {
+            outkey: data[inkey] if inkey in data else intermediate[inkey]
+            for outkey, inkey in output_mapping.items()
+        }
+
+    def get_selected_node_ids(self, selected_keys):
+        """Translates selected keys to dependency graph keys."""
+        return [self.key_to_node[key] for key in selected_keys]
+
+    def __call__(self, data):
+        """Shorthand for compute_outputs(data)."""
+        return self.compute_outputs(data)
+
+    def _prepare_run(self, data):
+        """Caches the evaluation order needed for the current output keys."""
+        selected = self.get_selected_node_ids(self.output_mapping.values())
+        self._exec_order = list(self.dg.get_evaluation_order(selected))
diff --git a/paddlespeech/s2t/io/speechbrain/data_utils.py b/paddlespeech/s2t/io/speechbrain/data_utils.py
new file mode 100755
index 000000000..3fca690aa
--- /dev/null
+++ b/paddlespeech/s2t/io/speechbrain/data_utils.py
@@ -0,0 +1,177 @@
+# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# Modified from speechbrain 2023 (https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/utils/data_utils.py)
+import collections.abc
+import csv
+import os
+import pathlib
+import re
+import shutil
+import urllib.request
+
+import numpy as np
+import paddle
+import tqdm
+
+
+def batch_pad_right(array: list, mode="constant", value=0):
+    """Given a list of numpy arrays it batches them together by padding to the
+    right on the first dimension in order to get same length for all.
+
+    Parameters
+    ----------
+    array : list
+        List of arrays we wish to pad together.
+    mode : str
+        Padding mode see numpy.pad documentation.
+    value : float
+        Padding value see numpy.pad documentation.
+
+    Returns
+    -------
+    batched : numpy array
+        Padded numpy array.
+    valid_vals : numpy array
+        Proportion of original, non-padded values on the first dimension.
+
+    """
+
+    if not len(array):
+        raise IndexError("Tensors list must not be empty")
+
+    if len(array) == 1:
+        # if there is only one tensor in the batch we simply unsqueeze it.
+        return np.expand_dims(array[0], 0), np.array([1.0], dtype="float32")
+    # Every element has to agree with the first one, hence all(), not any().
+    if not all(x.ndim == array[0].ndim for x in array[1:]):
+        raise IndexError("All array must have same number of dimensions")
+
+    # FIXME we limit the support here: we allow padding of only the first dimension
+    # need to remove this when feat extraction is updated to handle multichannel.
+    max_shape = []
+    for dim in range(array[0].ndim):
+        if dim != 0:
+            if not all(
+                [x.shape[dim] == array[0].shape[dim] for x in array[1:]]):
+                raise EnvironmentError(
+                    "Tensors should have same dimensions except for the first one"
+                )
+        max_shape.append(max([x.shape[dim] for x in array]))
+
+    batched = []
+    valid = []
+    for t in array:
+        # for each tensor we apply pad_right_to
+        padded, valid_percent = pad_right_to(
+            t, max_shape, mode=mode, value=value)
+        batched.append(padded)
+        valid.append(valid_percent[0])
+
+    batched = np.stack(batched)
+
+    return batched, np.array(valid, dtype="float32")
+
+
+np_str_obj_array_pattern = re.compile(r"[SaUO]")  # dtype.str codes of string/object arrays
+
+
+def pad_right_to(
+        array: np.ndarray,
+        target_shape: (list, tuple),
+        mode="constant",
+        value=0, ):
+    """
+    This function takes a numpy array of arbitrary shape and pads it to the
+    target shape by appending values on the right of every axis.
+
+    Parameters
+    ----------
+    array : input numpy array
+        Input array whose dimensions we need to pad.
+    target_shape : (list, tuple)
+        Target shape of the padded array; its len must equal array.ndim.
+    mode : str
+        Pad mode, please refer to numpy.pad documentation.
+    value : float
+        Pad value; only passed to numpy.pad when mode is "constant".
+
+    Returns
+    -------
+    array : numpy array
+        Padded numpy array.
+    valid_vals : list
+        List containing proportion for each dimension of original, non-padded values.
+    """
+    assert len(target_shape) == array.ndim
+    # numpy.pad takes one (before, after) pair per axis, in axis order
+    # (not the flat, last-axis-first list that torch's F.pad uses).
+    pads = []
+    valid_vals = []  # this contains the relative lengths for each dimension.
+    for orig_len, target_len in zip(array.shape, target_shape):
+        assert (target_len >= orig_len
+                ), "Target shape must be >= original shape for every dim"
+        pads.append((0, target_len - orig_len))
+        valid_vals.append(orig_len / target_len)
+    # numpy.pad rejects constant_values for every mode except "constant".
+    pad_kwargs = {"constant_values": value} if mode == "constant" else {}
+    array = np.pad(array, pads, mode, **pad_kwargs)
+
+    return array, valid_vals
+
+
+def mod_default_collate(batch):
+    """Makes a tensor from list of batch values.
+
+    Note that this doesn't need to zip(*) values together
+    as PaddedBatch connects them already (by key).
+
+    Here the idea is not to error out: whenever the values cannot be
+    stacked into one tensor, the batch list is returned as it is.
+    """
+    elem = batch[0]
+    elem_type = type(elem)
+    if isinstance(elem, paddle.Tensor):
+        try:
+            # The torch-style shared-memory pre-allocation (Tensor.storage(),
+            # Tensor.new()) was dropped: those are torch APIs, and the
+            # `name` argument of paddle.stack is not an output buffer, so
+            # that branch could only fail inside DataLoader workers.
+            return paddle.stack(batch, 0)
+        except (RuntimeError, ValueError):  # Unequal size:
+            # NOTE(review): paddle seems to raise ValueError here -- confirm.
+            return batch
+    elif (elem_type.__module__ == "numpy" and elem_type.__name__ != "str_" and
+          elem_type.__name__ != "string_"):
+        try:
+            if (elem_type.__name__ == "ndarray" or
+                    elem_type.__name__ == "memmap"):
+                # array of string classes and object
+                if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
+                    return batch
+                return mod_default_collate(
+                    [paddle.to_tensor(b, dtype=b.dtype) for b in batch])
+            elif elem.shape == ():  # scalars
+                # `batch` is a list and has no dtype; numpy infers it.
+                return paddle.to_tensor(np.asarray(batch))
+        except (RuntimeError, ValueError):  # Unequal size
+            return batch
+        return batch  # any other numpy object is left uncollated
+    elif isinstance(elem, float):
+        return paddle.to_tensor(batch, dtype=paddle.float64)
+    elif isinstance(elem, int):
+        return paddle.to_tensor(batch, dtype=paddle.int64)
+    else:
+        return batch
diff --git a/paddlespeech/s2t/io/speechbrain/dataio.py b/paddlespeech/s2t/io/speechbrain/dataio.py
new file mode 100755
index 000000000..e5e6f7661
--- /dev/null
+++ b/paddlespeech/s2t/io/speechbrain/dataio.py
@@ -0,0 +1,845 @@
+# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# Modified from speechbrain 2023 (https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/dataio/dataio.py)
+"""
+Data reading and writing.
+
+Authors
+ * Mirco Ravanelli 2020
+ * Aku Rouhe 2020
+ * Ju-Chieh Chou 2020
+ * Samuele Cornell 2020
+ * Abdel HEBA 2020
+"""
+import csv
+import hashlib
+import json
+import logging
+import os
+import pickle
+import re
+import time
+
+import numpy as np
+import soundfile
+logger = logging.getLogger(__name__)
+import paddle
+
+
+def load_data_json(json_path, replacements={}):
+    """Loads JSON and recursively formats string values.
+
+    Arguments
+    ----------
+    json_path : str
+        Path to JSON file.
+    replacements : dict
+        (Optional dict), e.g., {"data_folder": "/home/PaddleSpeech/data"}.
+        This is used to recursively format all string values in the data.
+
+    Returns
+    -------
+    dict
+        JSON data with replacements applied.
+
+    The file is decoded as UTF-8, independently of the platform locale.
+    """
+    with open(json_path, "r", encoding="utf-8") as f:
+        out_json = json.load(f)
+    _recursive_format(out_json, replacements)
+    return out_json
+
+
+def _recursive_format(data, replacements):
+    # data: dict or list (anything else is ignored), replacements: dict.
+    # Every str value, at any nesting depth, is rewritten in place with
+    # str.format_map(replacements); nested dicts and lists are descended
+    # into; values of all other types are left as they are.
+    if isinstance(data, dict):
+        slots = list(data.items())
+    elif isinstance(data, list):
+        slots = list(enumerate(data))
+    else:
+        slots = []
+    # `slot` is a dict key or a list index, so data[slot] works for both.
+    # Only already existing slots are reassigned, so the snapshot taken
+    # above stays in sync with the container.
+    for slot, value in slots:
+        if isinstance(value, (dict, list)):
+            _recursive_format(value, replacements)
+        elif isinstance(value, str):
+            data[slot] = value.format_map(replacements)
+
+
+def load_data_csv(csv_path, replacements={}):
+    """Loads CSV and formats string values.
+
+    Uses the legacy CSV data format, where the CSV must have an
+    'ID' field.
+    If there is a field called duration, it is interpreted as a float.
+    The rest of the fields are left as they are (legacy _format and _opts fields
+    are not used to load the data in any special way).
+
+    Bash-like string replacements with $to_replace are supported.
+
+    Arguments
+    ----------
+    csv_path : str
+        Path to CSV file.
+    replacements : dict
+        (Optional dict), e.g., {"data_folder": "/home/PaddleSpeech/data"}
+        This is used to recursively format all string values in the data.
+
+    Returns
+    -------
+    dict
+        CSV data with replacements applied.
+    """
+    # Decode as UTF-8 explicitly instead of relying on the platform locale.
+    with open(csv_path, newline="", encoding="utf-8") as csvfile:
+        result = {}
+        reader = csv.DictReader(csvfile, skipinitialspace=True)
+        variable_finder = re.compile(r"\$([\w.]+)")
+        for row in reader:
+            # ID:
+            try:
+                data_id = row["ID"]
+                del row["ID"]  # This is used as a key in result, instead.
+            except KeyError:
+                raise KeyError("CSV has to have an 'ID' field, with unique ids"
+                               " for all data points")
+            if data_id in result:
+                raise ValueError(f"Duplicate id: {data_id}")
+            # Replacements:
+            for key, value in row.items():
+                try:
+                    row[key] = variable_finder.sub(
+                        lambda match: str(replacements[match[1]]), value)
+                except KeyError:
+                    raise KeyError(f"The item {value} requires replacements "
+                                   "which were not supplied.")
+            # Duration:
+            if "duration" in row:
+                row["duration"] = float(row["duration"])
+            result[data_id] = row
+    return result
+
+
+def read_audio(waveforms_obj):
+    """General audio loading, based on a custom notation.
+
+    Expected use case is in conjunction with Datasets
+    specified by JSON.
+
+    The custom notation:
+
+    The annotation can be just a path to a file:
+    "/path/to/wav1.wav"
+
+    Or can specify more options in a dict ("start" and "stop" are optional):
+    {"file": "/path/to/wav2.wav",
+    "start": 8000,
+    "stop": 16000
+    }
+
+    Arguments
+    ----------
+    waveforms_obj : str, dict
+        Audio reading annotation, see above for format.
+
+    Returns
+    -------
+    numpy.ndarray
+        float32 audio as returned by soundfile.read.
+    """
+    if isinstance(waveforms_obj, str):
+        audio, _ = soundfile.read(waveforms_obj, dtype="float32")
+        return audio
+
+    path = waveforms_obj["file"]
+    start = waveforms_obj.get("start", 0)
+    # With stop=None soundfile reads up to the end of the file; defaulting
+    # stop to start (as before) made soundfile return zero frames.
+    stop = waveforms_obj.get("stop")
+    audio, _ = soundfile.read(
+        path, start=start, stop=stop, dtype="float32")
+    return audio
+
+
+def read_audio_multichannel(waveforms_obj):
+    """General audio loading, based on a custom notation.
+
+    Expected use case is in conjunction with Datasets
+    specified by JSON.
+
+    The custom notation:
+
+    The annotation can be just a path to a file:
+    "/path/to/wav1.wav"
+
+    Multiple (possibly multi-channel) files can be specified, as long as they
+    have the same length:
+    {"files": [
+        "/path/to/wav1.wav",
+        "/path/to/wav2.wav"
+        ]
+    }
+
+    Or you can specify a single file more succinctly:
+    {"files": "/path/to/wav2.wav"}
+
+    Offset number samples and stop number samples also can be specified to read
+    only a segment within the files.
+    {"files": [
+        "/path/to/wav1.wav",
+        "/path/to/wav2.wav"
+        ],
+    "start": 8000,
+    "stop": 16000
+    }
+
+    Arguments
+    ----------
+    waveforms_obj : str, dict
+        Audio reading annotation, see above for format.
+
+    Returns
+    -------
+    paddle.Tensor
+        Audio tensor with shape: (samples, ).
+    """
+    if isinstance(waveforms_obj, str):
+        audio, _ = soundfile.read(waveforms_obj, dtype="float32")
+        audio = paddle.to_tensor(audio)
+        return audio
+
+    files = waveforms_obj["files"]
+    if not isinstance(files, list):
+        files = [files]
+
+    waveforms = []
+    start = waveforms_obj.get("start", 0)
+    # A missing "stop" is passed as None: soundfile then reads to the end.
+    stop = waveforms_obj.get("stop")
+    # NOTE(review): axis-0 concat joins the files along the sample axis -- confirm.
+    for f in files:
+        audio, _ = soundfile.read(
+            f, start=start, stop=stop, dtype="float32")
+        audio = paddle.to_tensor(audio)
+        waveforms.append(audio)
+
+    out = paddle.concat(waveforms, 0)
+    return out
+
+
+def write_audio(filepath, audio, samplerate):
+    """Write audio on disk. It is basically a wrapper to support saving
+    paddle tensors or numpy arrays in format (signal, channels).
+
+    Arguments
+    ---------
+    filepath: path
+        Path where to save the audio file.
+    audio : paddle.Tensor, numpy.ndarray
+        Audio in the expected format (signal, channels), or 1-D for mono.
+    samplerate: int
+        Sample rate (e.g., 16000).
+
+    """
+    if isinstance(audio, paddle.Tensor):
+        audio = audio.numpy()
+    # soundfile expects (frames,) or (frames, channels) -- the layout the
+    # docstring promises -- so no transpose/unsqueeze is applied.
+
+    soundfile.write(filepath, audio, samplerate)
+
+
+def load_pickle(pickle_path):
+    """Reads a .pkl file and returns the object stored in it.
+
+    Arguments
+    ---------
+    pickle_path : str
+        Location of the pickle file on disk.
+
+    Returns
+    -------
+    out : object
+        Whatever Python object was unpickled from the file.
+    """
+    # NOTE: unpickling can execute arbitrary code; only load trusted files.
+    with open(pickle_path, mode="rb") as handle:
+        return pickle.load(handle)
+
+
+def to_floatTensor(x: (list, tuple, np.ndarray)):
+    """
+    Arguments
+    ---------
+    x : (list, tuple, np.ndarray)
+        Input data to be converted to a float32 paddle tensor.
+
+    Returns
+    -------
+    tensor : paddle.tensor
+        Result of paddle.to_tensor(x) with dtype float32.
+    """
+    return paddle.to_tensor(x, dtype='float32')
+
+
+def to_doubleTensor(x: (list, tuple, np.ndarray)):
+    """
+    Arguments
+    ---------
+    x : (list, tuple, np.ndarray)
+        Input data to be converted to a float64 paddle tensor.
+
+    Returns
+    -------
+    tensor : paddle.tensor
+        Result of paddle.to_tensor(x) with dtype float64.
+    """
+    return paddle.to_tensor(x, dtype='float64')
+
+
+def to_longTensor(x: (list, tuple, np.ndarray)):
+    """
+    Arguments
+    ---------
+    x : (list, tuple, np.ndarray)
+        Input data to be converted to an int64 paddle tensor.
+
+    Returns
+    -------
+    tensor : paddle.tensor
+        Result of paddle.to_tensor(x) with dtype int64.
+    """
+    return paddle.to_tensor(x, dtype='int64')
+
+
+def convert_index_to_lab(batch, ind2lab):
+    """Maps every integer ID in a batch of sequences to its string label.
+
+    Arguments
+    ---------
+    batch : list
+        Batch of sequences (list of lists) holding integer-like IDs.
+    ind2lab : dict
+        Lookup table from integer ID to label.
+
+    Returns
+    -------
+    list
+        List of lists shaped like batch, with each ID replaced by its label.
+    """
+    lookup = lambda token_id: ind2lab[int(token_id)]
+    return [list(map(lookup, sequence)) for sequence in batch]
+
+
+def relative_time_to_absolute(batch, relative_lens, rate):
+    """Converts relative length to the absolute duration.
+
+    Operates on batch level.
+
+    Arguments
+    ---------
+    batch : paddle.tensor
+        Sequences to determine the duration for.
+    relative_lens : paddle.tensor
+        The relative length of each sequence in batch. The longest sequence in
+        the batch needs to have relative length 1.0.
+    rate : float
+        The rate at which sequence elements occur in real-world time. Sample
+        rate, if batch is raw wavs (recommended) or 1/frame_shift if batch is
+        features. This has to have 1/s as the unit.
+
+    Returns
+    -------
+    paddle.tensor
+        Duration of each sequence in seconds.
+
+    """
+    max_len = batch.shape[1]
+    durations = paddle.round(relative_lens * max_len) / rate
+    return durations
+
+
+class IterativeCSVWriter:
+    """Write CSV files a line at a time.
+
+    Arguments
+    ---------
+    outstream : file-object
+        A writeable stream
+    data_fields : list
+        List of the optional keys to write. Each key will be expanded,
+        producing three fields: key, key_format, key_opts.
+    defaults : dict
+        Initial default values by field name; copied, never modified.
+    """
+
+    def __init__(self, outstream, data_fields, defaults={}):
+        self._outstream = outstream
+        self.fields = ["ID", "duration"] + self._expand_data_fields(data_fields)
+        self.defaults = dict(defaults)  # copy: set_default mutates this dict
+        self._outstream.write(",".join(self.fields))
+
+    def set_default(self, field, value):
+        """Sets a default value for the given CSV field.
+
+        Arguments
+        ---------
+        field : str
+            A field in the CSV.
+        value
+            The default value.
+        """
+        if field not in self.fields:
+            raise ValueError(f"{field} is not a field in this CSV!")
+        self.defaults[field] = value
+
+    def write(self, *args, **kwargs):
+        """Writes one data line into the CSV.
+
+        Arguments
+        ---------
+        *args
+            Supply every field with a value in positional form OR.
+        **kwargs
+            Supply certain fields by key. The ID field is mandatory for all
+            lines, but others can be left empty.
+        """
+        if args and kwargs:
+            raise ValueError(
+                "Use either positional fields or named fields, but not both.")
+        if args:
+            if len(args) != len(self.fields):
+                raise ValueError("Need consistent fields")
+            to_write = [str(arg) for arg in args]
+        elif kwargs:
+            if "ID" not in kwargs:
+                raise ValueError("I'll need to see some ID")
+            full_vals = self.defaults.copy()
+            full_vals.update(kwargs)
+            to_write = [str(full_vals.get(field, "")) for field in self.fields]
+        else:
+            raise ValueError("Nothing to write: supply the fields of the line.")
+        self._outstream.write("\n")
+        self._outstream.write(",".join(to_write))
+
+    def write_batch(self, *args, **kwargs):
+        """Writes a batch of lines into the CSV.
+
+        Here each argument should be a list with the same length.
+
+        Arguments
+        ---------
+        *args
+            Supply every field with a value in positional form OR.
+        **kwargs
+            Supply certain fields by key. The ID field is mandatory for all
+            lines, but others can be left empty.
+        """
+        if args and kwargs:
+            raise ValueError(
+                "Use either positional fields or named fields, but not both.")
+        if args:
+            if len(args) != len(self.fields):
+                raise ValueError("Need consistent fields")
+            for arg_row in zip(*args):
+                self.write(*arg_row)
+        if kwargs:
+            if "ID" not in kwargs:
+                raise ValueError("I'll need to see some ID")
+            keys = kwargs.keys()
+            for value_row in zip(*kwargs.values()):
+                kwarg_row = dict(zip(keys, value_row))
+                self.write(**kwarg_row)
+
+    @staticmethod
+    def _expand_data_fields(data_fields):
+        """Expands each key into key, key_format and key_opts, in order."""
+        return [f + s for f in data_fields for s in ("", "_format", "_opts")]
+
+
+def write_txt_file(data, filename, sampling_rate=None):
+    """Write data in text format.
+
+    Arguments
+    ---------
+    data : str, list, paddle.tensor, numpy.ndarray
+        The data to write in the text file.
+    filename : str
+        Path to file where to write the data.
+    sampling_rate : None
+        Not used, just here for interface compatibility.
+
+    Returns
+    -------
+    None
+
+    """
+    del sampling_rate  # Not used.
+    # Create the parent directory; a bare file name has none to create.
+    os.makedirs(os.path.dirname(filename) or ".", exist_ok=True)
+    with open(filename, "w") as fout:
+        if isinstance(data, paddle.Tensor):
+            data = data.tolist()
+        if isinstance(data, np.ndarray):
+            data = data.tolist()
+        if isinstance(data, list):
+            for line in data:
+                print(line, file=fout)
+        if isinstance(data, str):
+            print(data, file=fout)
+
+
+def write_stdout(data, filename=None, sampling_rate=None):
+    """Prints data on standard output.
+
+    Arguments
+    ---------
+    data : str, list, paddle.Tensor, numpy.ndarray
+        What to print: a str is printed once, a list one entry per line.
+    filename : None
+        Ignored; only present for interface compatibility.
+    sampling_rate : None
+        Ignored; only present for interface compatibility.
+
+    Returns
+    -------
+    None
+
+    """
+    # Tensors and arrays are first converted with tolist().
+    if isinstance(data, (paddle.Tensor, np.ndarray)):
+        data = data.tolist()
+    # A str is printed as it is, a list is printed one entry per line;
+    # data of any other type produces no output.
+    if isinstance(data, str):
+        print(data)
+    elif isinstance(data, list):
+        # One print call per entry.
+        for entry in data:
+            print(entry)
+
+
+def length_to_mask(length, max_len=None, dtype=None, device=None):
+    """Creates a binary mask for each sequence.
+
+    Arguments
+    ---------
+    length : LongTensor
+        Containing the length of each sequence in the batch. Must be 1D.
+    max_len : int
+        Max length for the mask, also the size of the second dimension.
+    dtype : dtype, default: None
+        The dtype of the generated mask.
+    device: device, default: None
+        Unused; only kept for interface compatibility.
+
+    Returns
+    -------
+    mask : tensor
+        The binary mask.
+    """
+    assert len(length.shape) == 1
+
+    if max_len is None:
+        max_len = int(length.max().item())  # using arange to generate mask
+    mask = paddle.arange(
+        max_len, dtype=length.dtype).expand(
+            [len(length), max_len]) < length.unsqueeze(1)
+
+    if dtype is None:
+        dtype = length.dtype
+
+    # `device` was looked up via the torch-style `length.device` but never
+    # applied to the mask, so the lookup is dropped.
+
+    mask = mask.astype(dtype)
+    return mask
+
+
+def read_kaldi_lab(kaldi_ali, kaldi_lab_opts):
+    """Read labels in kaldi format.
+
+    Uses kaldi IO.
+
+    Arguments
+    ---------
+    kaldi_ali : str
+        Path to directory where kaldi alignments are stored.
+    kaldi_lab_opts : str
+        A string that contains the options for reading the kaldi alignments.
+
+    Returns
+    -------
+    lab : dict
+        A dictionary containing the labels.
+
+    Note
+    ----
+    This depends on kaldi-io-for-python. Install it separately.
+    See: https://github.com/vesis84/kaldi-io-for-python
+    The ali*.gz files and final.mdl are read from the kaldi_ali directory.
+    """
+    # EXTRA TOOLS
+    try:
+        import kaldi_io
+    except ImportError:
+        raise ImportError("Could not import kaldi_io. Install it to use this.")
+    # Reading the Kaldi labels
+    lab = {
+        k: v
+        for k, v in kaldi_io.read_vec_int_ark(
+            "gunzip -c " + kaldi_ali + "/ali*.gz | " + kaldi_lab_opts + " " +
+            kaldi_ali + "/final.mdl ark:- ark:-|")
+    }
+    return lab
+
+
+def get_md5(file):
+    """Compute the md5 checksum of a file.
+
+    Arguments
+    ---------
+    file : str
+        Path of the file to hash; it is opened in binary mode.
+
+    Returns
+    -------
+    str
+        Hexadecimal md5 digest of the file content.
+    """
+    # The file is consumed in 64kb pieces so that it never has to be
+    # held in memory as a whole.
+    chunk_size = 65536
+    digest = hashlib.md5()
+
+    with open(file, "rb") as stream:
+        # iter() with a sentinel stops at the first empty read (EOF).
+        for chunk in iter(lambda: stream.read(chunk_size), b""):
+            digest.update(chunk)
+
+    return digest.hexdigest()
+
+
+def save_md5(files, out_file):
+    """Saves the md5 of a list of input files as a pickled dict into a file.
+
+    The resulting dict maps each path to its hexadecimal md5 digest.
+
+    Arguments
+    ---------
+    files : list
+        List of input files from which we will compute the md5.
+    out_file : str
+        The path where to store the output pkl file.
+
+    Returns
+    -------
+    None
+    """
+    # Map every path to its checksum (a repeated path keeps one entry).
+    checksums = {path: get_md5(path) for path in files}
+
+    # Persist the mapping as a pickle at `out_file`.
+    save_pkl(checksums, out_file)
+
+
+def save_pkl(obj, file):
+    """Serialize an object to a file with pickle.
+
+    Arguments
+    ---------
+    obj : object
+        Object to save; it must be supported by the pickle module.
+    file : str
+        Path to the output file. It is opened with mode "wb", so an
+        existing file is overwritten.
+
+    """
+    # pickle writes bytes, hence the binary mode.
+    with open(file, "wb") as sink:
+        pickle.dump(obj, sink)
+
+
+def load_pkl(file):
+    """Loads a pkl file.
+
+    The read is guarded by a ``<file>.lock`` marker file: the call waits
+    (up to about 100 seconds) while the marker exists, creates it for
+    the duration of the read and removes it afterwards.
+
+    Arguments
+    ---------
+    file : str
+        Path to the input pkl file.
+
+    Returns
+    -------
+    The loaded object.
+    """
+    lock_path = file + ".lock"
+    # Poll once per second while the marker exists; after
+    # 100 attempts the read goes ahead regardless.
+    for _ in range(100):
+        if not os.path.isfile(lock_path):
+            break
+        time.sleep(1)
+
+    try:
+        open(lock_path, "w").close()
+        with open(file, "rb") as stream:
+            # NOTE(review): only unpickle files from a trusted source.
+            return pickle.load(stream)
+    finally:
+        if os.path.isfile(lock_path):
+            os.remove(lock_path)
+
+
+def prepend_bos_token(label, bos_index):
+    """Create labels with <bos> token at the beginning.
+
+    Arguments
+    ---------
+    label : IntTensor
+        Containing the original labels. Must be of size: [batch_size, max_length].
+    bos_index : int
+        The index for <bos> token.
+
+    Returns
+    -------
+    new_label : tensor
+        An int64 tensor of size [batch_size, max_length + 1] whose first
+        column is <bos>. The input tensor is left untouched.
+    """
+    new_label = paddle.cast(label, "int64")
+    batch_size = label.shape[0]
+
+    # Built with native Paddle ops only (no torch-style tensor helpers).
+    bos = paddle.full([batch_size, 1], bos_index, dtype=new_label.dtype)
+    return paddle.concat([bos, new_label], axis=1)
+
+
+def append_eos_token(label, length, eos_index):
+    """Create labels with <eos> token appended.
+
+    Arguments
+    ---------
+    label : IntTensor
+        Containing the original labels. Must be of size: [batch_size, max_length]
+    length : LongTensor
+        Containing the original length of each label sequences. Must be 1D.
+    eos_index : int
+        The index for <eos> token.
+
+    Returns
+    -------
+    new_label : tensor
+        An int32 tensor of size [batch_size, max_length + 1] where, for
+        every row i, position length[i] holds <eos>.
+    """
+    new_label = paddle.to_tensor(label, dtype="int32").clone()
+    batch_size = label.shape[0]
+    # One extra zero column leaves room for <eos> after a full-length row.
+    pad = paddle.zeros([batch_size, 1], dtype=new_label.dtype)
+    new_label = paddle.concat([new_label, pad], axis=1)
+    rows = paddle.arange(batch_size)
+    cols = paddle.to_tensor(length, dtype="int64")
+    new_label[rows, cols] = eos_index
+    return new_label
+
+
+def merge_char(sequences, space="_"):
+    """Join character sequences and cut them into words.
+
+    Arguments
+    ---------
+    sequences : list
+        One list of characters per sentence.
+    space : string
+        The token that marks a word boundary. Default: _
+
+    Returns
+    -------
+    The list of word lists, one per sentence.
+
+    Example
+    -------
+    >>> merge_char([["a", "b", "_", "c", "d"]])
+    [['ab', 'cd']]
+    """
+    return ["".join(chars).split(space) for chars in sequences]
+
+
+def merge_csvs(data_folder, csv_lst, merged_csv):
+    """Merge several csv files sharing one header into a single file.
+
+    Arguments
+    ---------
+    data_folder : string
+        Folder that contains the csv files and receives the merged file.
+    csv_lst : list
+        Filenames of the csv files to merge; each header must equal the
+        first file's header, otherwise ValueError is raised.
+    merged_csv : string
+        Filename of the merged csv file; merging is skipped if it exists.
+    """
+    write_path = os.path.join(data_folder, merged_csv)
+    if os.path.isfile(write_path):
+        logger.info("Skipping merging. Completed in previous run.")
+        return
+    with open(os.path.join(data_folder, csv_lst[0])) as f:
+        header = f.readline()
+    lines = []
+    for csv_file in csv_lst:
+        with open(os.path.join(data_folder, csv_file)) as f:
+            for i, line in enumerate(f):
+                if i == 0:
+                    # The header is written once, so it is only compared.
+                    if line != header:
+                        raise ValueError("Different header for "
+                                         f"{csv_lst[0]} and {csv_file}.")
+                    continue
+                lines.append(line)
+    with open(write_path, "w") as f:
+        f.write(header)
+        f.writelines(lines)
+    logger.info(f"{write_path} is created.")
+
+
+def split_word(sequences, space="_"):
+    """Split word sequences into character sequences.
+
+    Arguments
+    ---------
+    sequences : list
+        One list of words per sentence.
+    space : string
+        The token inserted between consecutive words. Default: _
+
+    Returns
+    -------
+    The list of character lists, one per sentence.
+
+    Example
+    -------
+    >>> split_word([["ab", "cd"]])
+    [['a', 'b', '_', 'c', 'd']]
+    """
+    return [list(space.join(words)) for words in sequences]
diff --git a/paddlespeech/s2t/io/speechbrain/dataloader.py b/paddlespeech/s2t/io/speechbrain/dataloader.py
new file mode 100755
index 000000000..ed0fe9043
--- /dev/null
+++ b/paddlespeech/s2t/io/speechbrain/dataloader.py
@@ -0,0 +1,172 @@
+# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# Modified from speechbrain 2023 (https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/dataio/dataloader.py)
+"""Paddle compatible DataLoaders
+
+Essentially we extend Paddle DataLoader by adding the ability to save the
+data loading state, so that a checkpoint may be saved in the middle of an
+epoch.
+
+Authors:
+  * Aku Rouhe 2020
+"""
+import collections
+import functools
+import logging
+import warnings
+
+import paddle
+from paddle.io import DataLoader
+
+from paddlespeech.s2t.io.speechbrain.data_utils import batch_pad_right
+from paddlespeech.s2t.io.speechbrain.data_utils import mod_default_collate
+from paddlespeech.s2t.io.speechbrain.dataset import DynamicItemDataset
+from paddlespeech.s2t.io.speechbrain.sampler import ReproducibleRandomSampler
+PaddedData = collections.namedtuple("PaddedData", ["data", "lengths"])
+import numpy
+
+
+class Wav2vec2DataLoader(DataLoader):
+    """Thin wrapper around ``paddle.io.DataLoader``.
+
+    ``pin_memory``, ``multiprocessing_context`` and ``generator`` are unused.
+    """
+    def __init__(self,
+                 dataset,
+                 batch_size=1,
+                 shuffle=False,
+                 sampler=None,
+                 batch_sampler=None,
+                 num_workers=0,
+                 collate_fn=None,
+                 pin_memory=False,
+                 drop_last=False,
+                 timeout=0,
+                 worker_init_fn=None,
+                 multiprocessing_context=None,
+                 generator=None):
+        # The type of the first sample decides ``return_list``.
+        super().__init__(
+            dataset,
+            feed_list=None,
+            places=None,
+            return_list=isinstance(dataset[0], (tuple, list)),
+            batch_sampler=batch_sampler,
+            batch_size=batch_size,
+            shuffle=shuffle,
+            drop_last=drop_last,
+            collate_fn=collate_fn,
+            num_workers=num_workers,
+            use_buffer_reader=True,
+            use_shared_memory=False,
+            timeout=timeout,
+            worker_init_fn=worker_init_fn)
+        if sampler is not None:
+            self.batch_sampler.sampler = sampler
+
+
+def PaddedBatch(
+        examples,
+        padded_keys=None,
+        device_prep_keys=None,
+        padding_func=batch_pad_right,
+        padding_kwargs={},
+        nonpadded_stack=True, ):
+    """Collate example dicts into a dict of PaddedData / collated values.
+
+    Arguments
+    ---------
+    examples : list
+        Example dicts; the keys are taken from the first one.
+    padded_keys : list, None
+        Keys to pad; if None, keys whose first value is a numpy.ndarray.
+    device_prep_keys : list, None
+        Not used by this function.
+    padding_func, padding_kwargs : callable, dict
+        padding_func(values, **padding_kwargs) must return (data, lengths).
+    nonpadded_stack : bool
+        Whether to run mod_default_collate on the non-padded values.
+    """
+    res = {}
+    for key in examples[0].keys():
+        values = [example[key] for example in examples]
+        if (key in padded_keys if padded_keys is not None else
+                isinstance(values[0], numpy.ndarray)):
+            # Pad and keep the lengths alongside the data.
+            res[key] = PaddedData(*padding_func(values, **padding_kwargs))
+        elif nonpadded_stack:
+            res[key] = mod_default_collate(values)
+        else:
+            res[key] = values
+    return res
+
+
+def make_dataloader(dataset, stage, **loader_kwargs):
+    """Makes a basic DataLoader.
+
+    For DynamicItemDatasets (which return dicts), PaddedBatch is used
+    as the default collate_fn.
+
+    Shuffling gets implemented by ReproducibleRandomSampler: a truthy
+    ``shuffle`` option is replaced by such a sampler, because a sampler
+    and ``shuffle`` cannot both be set.
+
+    The options are edited inside this call only; since they arrive as
+    ``**kwargs``, a dict unpacked by the caller is never modified.
+
+    Arguments
+    ---------
+    dataset : Dataset
+        The dataset to make a DataLoader for.
+    stage : object
+        Not used by this function.
+    **loader_kwargs : dict
+        Keyword args to Wav2vec2DataLoader, see Paddle DataLoader for
+        options. ``collate_fn``, ``shuffle`` and ``sampler`` are
+        inspected here before everything is forwarded.
+
+    Returns
+    -------
+    Wav2vec2DataLoader
+        The loader built from ``dataset`` and ``loader_kwargs``.
+
+    Raises
+    ------
+    ValueError
+        If ``shuffle`` is truthy and a ``sampler`` is given as well.
+    """
+    # Dict-producing datasets are collated with PaddedBatch unless the
+    # caller picked a collate_fn.
+    if isinstance(dataset, DynamicItemDataset):
+        loader_kwargs.setdefault("collate_fn", PaddedBatch)
+
+    # Reproducible random sampling
+    wants_shuffle = loader_kwargs.get("shuffle", False)
+    if wants_shuffle:
+        # An explicit sampler would conflict with the one created below.
+        if loader_kwargs.get("sampler") is not None:
+            raise ValueError("Cannot specify both shuffle=True and a "
+                             "sampler in loader_kwargs")
+
+        # The loader must not receive both options, so ``shuffle`` is
+        # dropped in favour of the sampler.
+        del loader_kwargs["shuffle"]
+        sampler = ReproducibleRandomSampler(dataset)
+        loader_kwargs["sampler"] = sampler
+
+    # Create the loader
+    dataloader = Wav2vec2DataLoader(dataset, **loader_kwargs)
+    return dataloader
diff --git a/paddlespeech/s2t/io/speechbrain/dataset.py b/paddlespeech/s2t/io/speechbrain/dataset.py
new file mode 100755
index 000000000..136275b77
--- /dev/null
+++ b/paddlespeech/s2t/io/speechbrain/dataset.py
@@ -0,0 +1,371 @@
+# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# Modified from speechbrain 2023 (https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/dataio/dataset.py)
+import contextlib
+import copy
+import logging
+from types import MethodType
+
+from paddle.io import Dataset
+
+from paddlespeech.s2t.io.speechbrain.data_pipeline import DataPipeline
+from paddlespeech.s2t.io.speechbrain.dataio import load_data_csv
+from paddlespeech.s2t.io.speechbrain.dataio import load_data_json
+
+logger = logging.getLogger(__name__)
+
+
+class DynamicItemDataset(Dataset):
+    """Dataset that reads, wrangles, and produces dicts.
+
+    Each data point dict provides some items (by key), for example, a path to a
+    wavefile with the key "wav_file". When a data point is fetched from this
+    Dataset, more items are produced dynamically, based on pre-existing items
+    and other dynamic created items. For example, a dynamic item could take the
+    wavfile path and load the audio from the disk.
+
+    The dynamic items can depend on other dynamic items: a suitable evaluation
+    order is used automatically, as long as there are no circular dependencies.
+
+    A specified list of keys is collected in the output dict. These can be items
+    in the original data or dynamic items. If some dynamic items are not
+    requested, nor depended on by other requested items, they won't be computed.
+    So for example if a user simply wants to iterate over the text, the
+    time-consuming audio loading can be skipped.
+
+    About the format:
+    Takes a dict of dicts as the collection of data points to read/wrangle.
+    The top level keys are data point IDs.
+    Each data point (example) dict should have the same keys, corresponding to
+    different items in that data point.
+
+    Altogether the data collection could look like this:
+
+    >>> data = {
+    ...  "spk1utt1": {
+    ...      "wav_file": "/path/to/spk1utt1.wav",
+    ...      "text": "hello world",
+    ...      "speaker": "spk1",
+    ...      },
+    ...  "spk1utt2": {
+    ...      "wav_file": "/path/to/spk1utt2.wav",
+    ...      "text": "how are you world",
+    ...      "speaker": "spk1",
+    ...      }
+    ... }
+
+    NOTE
+    ----
+        The top-level key, the data point id, is implicitly added as an item
+        in the data point, with the key "id"
+
+    Each dynamic item is configured by three things: a key, a func, and a list
+    of argkeys. The key should be unique among all the items (dynamic or not) in
+    each data point. The func is any callable, and it returns the dynamic item's
+    value. The callable is called with the values of other items as specified
+    by the argkeys list (as positional args, passed in the order specified by
+    argkeys).
+
+    Arguments
+    ---------
+    data : dict
+        Dictionary containing single data points (e.g. utterances).
+    dynamic_items : list, optional
+        Configuration for the dynamic items produced when fetching an example.
+        List of DynamicItems or dicts with the format::
+            func: <callable> # To be called
+            takes: <list> # key or list of keys of args this takes
+            provides: key # key or list of keys that this provides
+    output_keys : dict, list, optional
+        List of keys (either directly available in data or dynamic items)
+        to include in the output dict when data points are fetched.
+
+        If a dict is given; it is used to map internal keys to output keys.
+        From the output_keys dict key:value pairs the key appears outside,
+        and value is the internal key.
+    """
+
+    def __init__(
+            self,
+            data,
+            dynamic_items=[],
+            output_keys=[], ):
+        self.data = data
+        self.data_ids = list(self.data.keys())
+        static_keys = list(self.data[self.data_ids[0]].keys())
+        if "id" in static_keys:
+            raise ValueError("The key 'id' is reserved for the data point id.")
+        static_keys.append("id")
+        self.pipeline = DataPipeline(static_keys, dynamic_items)
+        self.set_output_keys(output_keys)
+
+    def __len__(self):
+        return len(self.data_ids)
+
+    def __getitem__(self, index):
+        """Run the pipeline on the data point at ``index`` (with its id added)."""
+        data_id = self.data_ids[index]
+        data_point = self.data[data_id]
+        return self.pipeline.compute_outputs({"id": data_id, **data_point})
+
+    def add_dynamic_item(self, func, takes=None, provides=None):
+        """Makes a new dynamic item available on the dataset.
+
+        Two calling conventions. For DynamicItem objects, just use:
+        add_dynamic_item(dynamic_item).
+        But otherwise, should use:
+        add_dynamic_item(func, takes, provides).
+
+        Arguments
+        ---------
+        func : callable, DynamicItem
+            If a DynamicItem is given, adds that directly. Otherwise a
+            DynamicItem is created, and this specifies the callable to use. If
+            a generator function is given, then create a GeneratorDynamicItem.
+            Otherwise creates a normal DynamicItem.
+        takes : list, str
+            List of keys. When func is called, each key is resolved to
+            either an entry in the data or the output of another dynamic_item.
+            The func is then called with these as positional arguments,
+            in the same order as specified here.
+            A single arg can be given directly.
+        provides : str
+            Unique key or keys that this provides.
+        """
+        self.pipeline.add_dynamic_item(func, takes, provides)
+
+    def set_output_keys(self, keys):
+        """Use this to change the output keys.
+
+        These are the keys that are actually evaluated when a data point
+        is fetched from the dataset.
+
+        Arguments
+        ---------
+        keys : dict, list
+            List of keys (str) to produce in output.
+
+            If a dict is given; it is used to map internal keys to output keys.
+            From the output_keys dict key:value pairs the key appears outside,
+            and value is the internal key.
+        """
+        self.pipeline.set_output_keys(keys)
+
+    @contextlib.contextmanager
+    def output_keys_as(self, keys):
+        """Context manager to temporarily set output keys.
+
+        NOTE
+        ----
+        Not thread-safe. While in this context manager, the output keys
+        are affected for any call.
+        """
+        saved_output = self.pipeline.output_mapping
+        self.pipeline.set_output_keys(keys)
+        try:
+            yield self
+        finally:
+            self.pipeline.set_output_keys(saved_output)
+
+    def filtered_sorted(
+            self,
+            key_min_value={},
+            key_max_value={},
+            key_test={},
+            sort_key=None,
+            reverse=False,
+            select_n=None, ):
+        """Get a filtered and/or sorted version of this, shares static data.
+
+        The reason to implement these operations in the same method is that
+        computing some dynamic items may be expensive, and this way the
+        filtering and sorting steps don't need to compute the dynamic items
+        twice.
+
+        Arguments
+        ---------
+        key_min_value : dict
+            Map from key (in data or in dynamic items) to limit, will only keep
+            data_point if data_point[key] >= limit
+        key_max_value : dict
+            Map from key (in data or in dynamic items) to limit, will only keep
+            data_point if data_point[key] <= limit
+        key_test : dict
+            Map from key (in data or in dynamic items) to func, will only keep
+            data_point if bool(func(data_point[key])) == True
+        sort_key : None, str
+            If not None, sort by data_point[sort_key]. Default is ascending
+            order.
+        reverse : bool
+            If True, sort in descending order.
+        select_n : None, int
+            If not None, only keep (at most) the first n filtered data_points.
+            The possible sorting is applied, but only on the first n data
+            points found. Meant for debugging.
+
+        Returns
+        -------
+        FilteredSortedDynamicItemDataset
+            Shares the static data, but has its own output keys and
+            dynamic items (initially deep copied from this, so they have the
+            same dynamic items available)
+
+        NOTE
+        ----
+        Temporarily changes the output keys!
+        """
+        filtered_sorted_ids = self._filtered_sorted_ids(
+            key_min_value,
+            key_max_value,
+            key_test,
+            sort_key,
+            reverse,
+            select_n, )
+        return FilteredSortedDynamicItemDataset(
+            self, filtered_sorted_ids)  # NOTE: defined below
+
+    def _filtered_sorted_ids(
+            self,
+            key_min_value={},
+            key_max_value={},
+            key_test={},
+            sort_key=None,
+            reverse=False,
+            select_n=None, ):
+        """Returns a list of data ids, fulfilling the sorting and filtering."""
+
+        def combined_filter(computed):
+            """Applies filter."""
+            for key, limit in key_min_value.items():
+                # NOTE: the docstring promises >=, so exactly that operator
+                # is used instead of a negated <.
+                if computed[key] >= limit:
+                    continue
+                return False
+            for key, limit in key_max_value.items():
+                if computed[key] <= limit:
+                    continue
+                return False
+            for key, func in key_test.items():
+                if bool(func(computed[key])):
+                    continue
+                return False
+            return True
+
+        temp_keys = (set(key_min_value.keys()) | set(key_max_value.keys()) |
+                     set(key_test.keys()) |
+                     set([] if sort_key is None else [sort_key]))
+        filtered_ids = []
+        with self.output_keys_as(temp_keys):
+            for i, data_id in enumerate(self.data_ids):
+                if select_n is not None and len(filtered_ids) == select_n:
+                    break
+                # Build a fresh dict so the shared static data is not mutated.
+                computed = self.pipeline.compute_outputs(
+                    {"id": data_id, **self.data[data_id]})
+                if combined_filter(computed):
+                    if sort_key is not None:
+                        # Add (main sorting index, current index, data_id)
+                        # So that we maintain current sorting and don't compare
+                        # data_id values ever.
+                        filtered_ids.append((computed[sort_key], i, data_id))
+                    else:
+                        filtered_ids.append(data_id)
+        if sort_key is not None:
+            filtered_sorted_ids = [
+                tup[2] for tup in sorted(filtered_ids, reverse=reverse)
+            ]
+        else:
+            filtered_sorted_ids = filtered_ids
+        return filtered_sorted_ids
+
+    @classmethod
+    def from_json(cls,
+                  json_path,
+                  replacements={},
+                  dynamic_items=[],
+                  output_keys=[]):
+        """Load a data prep JSON file and create a Dataset based on it."""
+        data = load_data_json(json_path, replacements)
+        return cls(data, dynamic_items, output_keys)
+
+    @classmethod
+    def from_csv(cls,
+                 csv_path,
+                 replacements={},
+                 dynamic_items=[],
+                 output_keys=[]):
+        """Load a data prep CSV file and create a Dataset based on it."""
+        data = load_data_csv(csv_path, replacements)
+        return cls(data, dynamic_items, output_keys)
+
+    @classmethod
+    def from_arrow_dataset(cls,
+                           dataset,
+                           replacements={},
+                           dynamic_items=[],
+                           output_keys=[]):
+        """Loading a prepared huggingface dataset (`replacements` is unused)."""
+
+        # define an unbound method to generate pseudo keys
+        def keys(self):
+            "Returns the keys."
+            return list(range(len(dataset)))
+
+        # bind this method to arrow dataset
+        dataset.keys = MethodType(keys, dataset)
+        return cls(dataset, dynamic_items, output_keys)
+
+
+class FilteredSortedDynamicItemDataset(DynamicItemDataset):
+    """Possibly filtered, possibly sorted DynamicItemDataset.
+
+    Shares the static data (reference) and deep-copies the pipeline.
+    Built from an existing dataset; from_json/from_csv raise TypeError.
+    """
+
+    def __init__(self, from_dataset, data_ids):
+        self.data = from_dataset.data
+        self.data_ids = data_ids
+        self.pipeline = copy.deepcopy(from_dataset.pipeline)
+
+    @classmethod
+    def from_json(cls,
+                  json_path,
+                  replacements={},
+                  dynamic_items=None,
+                  output_keys=None):
+        raise TypeError(f"Cannot create {cls.__name__} directly!")
+
+    @classmethod
+    def from_csv(cls,
+                 csv_path,
+                 replacements={},
+                 dynamic_items=None,
+                 output_keys=None):
+        raise TypeError(f"Cannot create {cls.__name__} directly!")
+
+
+def add_dynamic_item(datasets, func, takes=None, provides=None):
+    """Register one dynamic item on every dataset in ``datasets``."""
+    for target in datasets:
+        target.add_dynamic_item(func, takes, provides)
+
+
+def set_output_keys(datasets, output_keys):
+    """Apply the same output keys to every dataset in ``datasets``."""
+    for target in datasets:
+        target.set_output_keys(output_keys)
diff --git a/paddlespeech/s2t/io/speechbrain/depgraph.py b/paddlespeech/s2t/io/speechbrain/depgraph.py
new file mode 100755
index 000000000..501b1d508
--- /dev/null
+++ b/paddlespeech/s2t/io/speechbrain/depgraph.py
@@ -0,0 +1,237 @@
+# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# Modified from speechbrain 2023 (https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/utils/depgraph.py)
+"""A dependency graph for finding evaluation order.
+
+Authors:
+    * Aku Rouhe 2020
+"""
+import collections
+import uuid
+
+
+class CircularDependencyError(ValueError):
+    """
+    An error caused by running into circular dependencies while searching for
+    an evaluation order in a DependencyGraph.
+    """
+
+    pass
+
+
+DGNode = collections.namedtuple("DGNode", ["key", "edges", "data"])
+
+# A node in DependencyGraph.
+
+
+class DependencyGraph:
+    """General-purpose dependency graph.
+
+    Essentially a directed acyclic graph.
+    Usually used to find an evaluation order for e.g. variable substitution
+    The relation that an edge between A and B represents is:
+    "A depends on B, i.e. B should be evaluated before A"
+
+    Nodes can be added explicitly or they can be created implicitly
+    while adding edges.
+    Nodes have keys, which should be some hashable value that identifies
+    the elements the graph represents in your use case. E.G. they can just
+    be the variable name you want to substitute.
+    However, if needed, more generally you can attach any data to a node
+    (e.g. a path in your tree), and if so desired, a unique key can be
+    created for you. You'll only need to know that key while adding edges
+    to/from it.
+    Implicit keys and explicit keys can also be mixed.
+    """
+
+    def __init__(self):
+        self.digraph = []
+        self.key2ind = {}
+        # Guard for manual duplicates (but not implicitly added ones)
+        self._manually_added_keys = []
+
+    @staticmethod
+    def get_unique_key():
+        """Returns a unique hashable identifier."""
+        return uuid.uuid4()
+
+    def add_node(self, key=None, data=None):
+        """Adds a node explicitly.
+
+        Arguments
+        ---------
+        key : hashable, optional
+            If not given, a key is created for you.
+        data : Any, optional
+            Any additional data you wish to attach to this node.
+
+        Returns
+        -------
+        hashable
+            The key that was used (either yours or generated).
+
+        Raises
+        ------
+        ValueError
+            If node with the given key has already been added explicitly
+            (with this method, not "add_edge").
+        """
+        if key is None:
+            key = self.get_unique_key()
+        elif key in self._manually_added_keys:
+            raise ValueError("Adding duplicate node: {key}".format(key=key))
+        else:
+            self._manually_added_keys.append(key)
+        if key in self.key2ind:  # Implicitly added already; don't add again.
+            ind = self.key2ind[key]
+            node = self.digraph[ind]
+            # All that this operation can do is add data:
+            self.digraph[ind] = DGNode(node.key, node.edges, data)
+            return key
+        self.key2ind[key] = len(self.digraph)
+        self.digraph.append(DGNode(key, [], data))
+        return key
+
+    def add_edge(self, from_key, to_key):
+        """Adds an edge, and implicitly also creates nodes for keys which have
+        not been seen before. This will not let you add data to your nodes.
+        The relation encodes: "from_key depends on to_key"
+        (to_key must be evaluated before from_key).
+
+        Arguments
+        ---------
+        from_key : hashable
+            The key which depends on.
+        to_key : hashable
+            The key which is depended on.
+
+        Returns
+        -------
+        None
+        """
+        from_ind = self._get_ind_and_add_if_new(from_key)
+        to_ind = self._get_ind_and_add_if_new(to_key)
+        edges_list = self.digraph[from_ind].edges
+        if to_ind not in edges_list:
+            edges_list.append(to_ind)
+
+    def _get_ind_and_add_if_new(self, key):
+        # Used internally to implicitly add nodes for unseen keys
+        if key not in self.key2ind:
+            self.key2ind[key] = len(self.digraph)
+            self.digraph.append(DGNode(key, [], None))
+        return self.key2ind[key]
+
+    def is_valid(self):
+        """Checks if an evaluation order can be found.
+
+        A dependency graph is evaluatable if there are no circular
+        dependencies, i.e., the graph is acyclic.
+
+        Returns
+        -------
+        bool
+            Indicating if the graph is evaluatable.
+        """
+        return not self._find_first_cycle()
+
+    def get_evaluation_order(self, selected_keys=None):
+        """Finds one valid evaluation order.
+
+        There can be many different valid orders; nodes are produced by a
+        depth-first walk, dependencies first.
+        NOTE: Generates output one DGNode at a time, so DGNodes may already
+        have been yielded before a circular dependency is found. If you need
+        to know up front whether an order exists, check is_valid() first
+        (it runs essentially the same depth-first search, so on very large
+        graphs that doubles the work).
+
+        Arguments
+        ---------
+        selected_keys : list, None
+            List of keys. If not None, only the selected keys are guaranteed
+            in the evaluation order (along with the keys they depend on).
+
+        Yields
+        ------
+        DGNode
+            The added DGNodes in a valid evaluation order.
+            See the DGNode namedtuple above.
+
+        Raises
+        ------
+        CircularDependencyError
+            If a circular dependency is found.
+        KeyError
+            If a key in selected_keys is not in the graph.
+        """
+        seen_ever = set()
+
+        def toposort(root_ind, visited):
+            """Depth-first post-order walk; ``visited`` is the current path."""
+            # seen_ever is only mutated in place, so no ``nonlocal`` is needed.
+            here = visited + [root_ind]
+            if root_ind in visited:
+                raise CircularDependencyError("{cycle}".format(
+                    cycle=" -> ".join(str(self.digraph[i].key) for i in here)))
+            if root_ind in seen_ever:
+                return  # Yield nothing
+            seen_ever.add(root_ind)  # O(1), no copy of the set per visit
+            for to_ind in self.digraph[root_ind].edges:
+                # Dependencies are yielded before the node depending on them.
+                yield from toposort(to_ind, visited=here)
+            yield root_ind
+
+        if selected_keys is None:
+            start_inds = range(len(self.digraph))
+        else:
+            start_inds = [self.key2ind[key] for key in selected_keys]
+
+        for start_ind in start_inds:
+            for ind in toposort(start_ind, []):
+                yield self.digraph[ind]
+
+    def _find_first_cycle(self):
+        """Depth-first search for a cycle; returns its index path, else []."""
+        seen_ever = set()
+
+        def cycle_dfs(root_ind, visited):
+            """Walks from root_ind; ``visited`` is the index path so far."""
+            # seen_ever is only mutated in place, so no ``nonlocal`` is needed.
+            # ``here`` ends with the repeated index when a cycle is closed.
+            here = visited + [root_ind]
+            if root_ind in visited:
+                return here
+            if root_ind in seen_ever:
+                return []
+            seen_ever.add(root_ind)
+            for to_ind in self.digraph[root_ind].edges:
+                cycle = cycle_dfs(to_ind, here)
+                if cycle:
+                    return cycle
+            return []
+
+        for ind in range(len(self.digraph)):
+            if ind not in seen_ever:
+                cycle = cycle_dfs(ind, [])
+                if cycle:
+                    return cycle
+        return []
+
+    def __contains__(self, key):
+        # Allows the syntax:
+        # 'key' in dependency_graph
+        return key in self.key2ind
diff --git a/paddlespeech/s2t/io/speechbrain/make_dataloader.py b/paddlespeech/s2t/io/speechbrain/make_dataloader.py
new file mode 100755
index 000000000..e6a622d7d
--- /dev/null
+++ b/paddlespeech/s2t/io/speechbrain/make_dataloader.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Modified from speechbrain 2023 (https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/core.py)
+import paddlespeech.s2t.io.speechbrain.dataloader
+
+
+def _train_loader_specifics(self, dataset, loader_kwargs):
+    """Adjusts sampler/shuffle entries of ``loader_kwargs`` for training.
+
+    May set ``self.train_sampler``; returns the same dict, edited in place.
+    NOTE(review): ReproducibleRandomSampler, IterableDataset, logger and the
+    Distributed* samplers are not imported in this module -- confirm source.
+    """
+    sampler = loader_kwargs.get("sampler", None)
+    # Shuffling matters for train only; it adds padding if sorted by length.
+    shuffle = loader_kwargs.get("shuffle", False)
+    if shuffle and not self.distributed_launch:
+        if sampler is not None:
+            raise ValueError("Cannot specify both shuffle=True "
+                             "and a sampler in loader_kwargs")
+        sampler = ReproducibleRandomSampler(dataset)
+        self.train_sampler = sampler
+        loader_kwargs["sampler"] = self.train_sampler
+        # A sampler and shuffling cannot both be specified: drop the flag.
+        del loader_kwargs["shuffle"]
+
+    # Possibly make a DistributedSampler or a wrapper for some other sampler
+    if self.distributed_launch and not isinstance(dataset, IterableDataset):
+        drop_last = loader_kwargs.get("drop_last", False)
+        # num_replicas (world_size) is retrieved inside the sampler object.
+        if sampler is not None:
+            self.train_sampler = DistributedSamplerWrapper(
+                sampler,
+                rank=self.rank,
+                drop_last=drop_last,
+                shuffle=shuffle, )
+
+            # with DistributedSamplerWrapper, one must disable shuffling for dataloader
+            loader_kwargs["shuffle"] = False
+            loader_kwargs["sampler"] = self.train_sampler
+        elif loader_kwargs.get("batch_sampler") is None:
+            # no sampler and batch-sampler
+            self.train_sampler = DistributedSampler(
+                dataset, rank=self.rank, shuffle=True, drop_last=drop_last)
+            # the sampler shuffles itself, so disable shuffling for dataloader
+            loader_kwargs["shuffle"] = False
+            loader_kwargs["sampler"] = self.train_sampler
+        else:  # batch_sampler was specified
+            self.train_sampler = DistributedSamplerWrapper(
+                loader_kwargs.get("batch_sampler", None),
+                rank=self.rank,
+                shuffle=True, )
+            loader_kwargs["batch_sampler"] = self.train_sampler
+    elif self.distributed_launch and isinstance(dataset, IterableDataset):
+        logger.warning("Cannot automatically solve distributed sampling "
+                       "for IterableDataset.")
+    return loader_kwargs
+
+
+def make_dataloader(self, dataset, stage, **loader_kwargs):
+    """Creates a DataLoader for a Dataset.
+
+        Thin wrapper that forwards ``dataset`` and ``**loader_kwargs`` to
+        ``paddlespeech.s2t.io.speechbrain.dataloader.make_dataloader`` and
+        returns whatever that function returns.
+
+        This can be called from outside a Brain subclass. In that case, the
+        DataLoader should be passed to ``fit()`` in place of the dataset.
+
+        NOTE
+        ----
+        This function does not treat the train stage specially: ``self``
+        and ``stage`` are accepted for interface compatibility but are not
+        used, and ``_train_loader_specifics`` is not called from here.
+        Callers that need its sampler handling must apply it themselves.
+
+        NOTE
+        ----
+        Some important DataLoader arguments are passed via **loader_kwargs,
+        e.g., batch_size, num_workers, pin_memory.
+
+        NOTE
+        ----
+        There is no ``ckpt_prefix`` parameter here; if it is passed, it is
+        forwarded to the underlying function inside ``**loader_kwargs``.
+
+        Arguments
+        ---------
+        self : object
+            Unused.
+        dataset : Dataset
+            A set of data to use to create data loader.
+        stage : Stage
+            The stage of the experiment. Currently unused.
+        **loader_kwargs : dict
+            Additional keyword arguments to the DataLoader.
+            E.g., batch_size, num_workers, pin_memory.
+
+        Returns
+        -------
+        The object returned by ``dataloader.make_dataloader``.
+        """
+
+    sb_dataloader = paddlespeech.s2t.io.speechbrain.dataloader
+    return sb_dataloader.make_dataloader(dataset, **loader_kwargs)
diff --git a/paddlespeech/s2t/io/speechbrain/sampler.py b/paddlespeech/s2t/io/speechbrain/sampler.py
new file mode 100755
index 000000000..ba13193eb
--- /dev/null
+++ b/paddlespeech/s2t/io/speechbrain/sampler.py
@@ -0,0 +1,503 @@
+# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# Modified from speechbrain 2023 (https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/dataio/sampler.py)
+"""compatible samplers.
+
+These determine the order of iteration through a dataset.
+
+Authors:
+  * Aku Rouhe 2020
+  * Samuele Cornell 2020
+  * Ralf Leibold 2020
+  * Artem Ploujnikov 2021
+  * Andreas Nautsch 2021
+"""
+import logging
+from collections import Counter
+from typing import List
+
+import numpy as np
+import paddle
+from paddle.io import RandomSampler
+from paddle.io import Sampler
+from paddle.io import WeightedRandomSampler
+from scipy.stats import lognorm
+
+from paddlespeech.s2t.io.speechbrain.dataset import DynamicItemDataset
+
+logger = logging.getLogger(__name__)
+
+
+class ReproducibleRandomSampler(RandomSampler):
+    """A modification of RandomSampler which always returns the same values.
+
+    See `paddle.io.RandomSampler`: mostly the same behaviour and arguments,
+    but adds 'seed' and 'epoch' and does not support 'generator'.
+
+    Note
+    ----
+    Call `set_epoch` before every epoch. Otherwise, the sampler will produce the
+    same sequence of indices every epoch.
+    NOTE(review): `paddle.seed` presumably reseeds the global default generator
+    and returns it, so __init__ and __iter__ would also affect other random
+    ops -- confirm, and confirm `paddle.io.RandomSampler` draws from it.
+
+    Arguments
+    ---------
+    data_source : Dataset
+        The data source to sample indices for.
+    seed : int
+        The base seed to use for the random number generator. It is recommended
+        to use a value which has a good mix of 0 and 1 bits.
+    epoch : int
+        The epoch to start at.
+    """
+
+    def __init__(self, data_source, seed=563375142, epoch=0, **kwargs):
+        if "generator" in kwargs:
+            MSG = ("Cannot give a separate generator when using " +
+                   "ReproducibleRandomSampler")
+            raise ValueError(MSG)
+        super().__init__(data_source, **kwargs)
+        self.seed = int(seed)
+        self.epoch = epoch
+        self.gen = paddle.seed(1)
+
+    def set_epoch(self, epoch):
+        """Sets the epoch that __iter__ adds to the base seed.
+
+        Kept as a method to mirror paddle.io.DistributedBatchSampler."""
+        self.epoch = epoch
+
+    def __iter__(self):
+        self.gen.manual_seed(self.seed + self.epoch)
+        return super().__iter__()
+
+
+class ReproducibleWeightedRandomSampler(WeightedRandomSampler):
+    """A reproducible modification of WeightedRandomSampler.
+
+    Also look at `paddle.io.WeightedRandomSampler`. This has the
+    the same behaviour and arguments, except for adding 'seed' and 'epoch' and
+    not supporting 'generator'.
+
+    Note
+    ----
+    Call `set_epoch` before every epoch. Otherwise, the sampler will produce the
+    same sequence of indices every epoch.
+
+    Arguments
+    ---------
+    weights : sequence of float
+        Weights for each index. Doesn't need to sum to one.
+    num_samples : int
+        Number of samples to draw
+    replacement : bool
+        To draw with replacement or not (within an epoch of num_samples).
+    seed : int
+        The base seed to use for the random number generator. It is recommended
+        to use a value which has a good mix of 0 and 1 bits.
+    epoch : int
+        The epoch to start at.
+    """
+
+    def __init__(self, weights, num_samples, replacement, seed=129491412,
+                 epoch=0, **kwargs):
+        """Rejects 'generator', then stores the seed and starting epoch.
+
+        See the class docstring for the meaning of each argument.
+        """
+        if "generator" in kwargs:
+            MSG = ("Cannot give a separate generator when using " +
+                   "ReproducibleWeightedRandomSampler")
+            raise ValueError(MSG)
+        super().__init__(weights, num_samples, replacement, **kwargs)
+        self.seed = int(seed)
+        self.epoch = epoch
+        # NOTE(review): paddle.seed presumably reseeds the global default
+        # generator and returns it -- confirm this side effect is intended.
+        self.gen = paddle.seed(1)
+
+    def set_epoch(self, epoch):
+        """
+        You can also just access self.epoch, but we maintain this interface
+        to mirror paddle.io.DistributedBatchSampler
+        """
+        self.epoch = epoch
+
+    def __iter__(self):
+        self.gen.manual_seed(self.seed + self.epoch)
+        return super().__iter__()
+
+
+class DynamicBatchSampler(Sampler):
+    """This BatchSampler batches examples together by grouping them by their length.
+
+    Every example in the batch have approximately the same length and
+    thus padding is minimized.
+    This enables faster training on datasets
+    where length of examples can vary significantly (e.g Librispeech).
+    Inspired by: https://www.tensorflow.org/api_docs/python/tf/data/experimental/bucket_by_sequence_length
+
+    Dynamic batching is performed by specifying a max_batch_length which is the
+    upper limit for the sum of the length of examples in a batch:
+    e.g., if ex1 has length 4, ex2 length 5 and if max_batch_length is set to 6
+    ex1 and ex2 will be placed, alone, in two distinct batches.
+
+    Length for each example can be obtained in two manners.
+    If the input dataset is a DynamicItemDataset it can be obtained by specifying a
+    length_func. Default assumes a "duration" entry is in the annotation.
+    Length for each example can also be passed to this class upon instantiation
+    by specifying a list containing the length for each example and passing it to
+    lengths_list.
+
+    Examples are grouped together by defining a set of possible discrete intervals
+    (buckets). Examples whose length fall into these intervals can be batched together.
+
+    The number of buckets can be specified by using the arg num_buckets.
+    There is usually an optimal range for the value of this argument.
+
+    If num_buckets == 1, all examples can be batched together. You have maximum randomization
+    but your training speed will be slower due to the fact that a large amount of the values will be padding
+    as long and short examples can be batched together.
+    As the number of buckets grows only examples with similar
+    length can be grouped together.
+    This trades-off speed with randomization.
+    TLDR: Low number -> better randomization, High number -> faster training.
+    NOTE THAT: if set too high the training speed will decrease. If num_buckets -> number of examples in the 
+    dataset the batch size will be small impacting training speed and possibly performance.
+
+    The buckets can also be specified by passing a list to the bucket_boundaries
+    argument instead of specifying a left_bucket_length and a bucket_length_multiplier.
+
+    """
+
+    def __init__(
+            self,
+            dataset,
+            max_batch_length: int,
+            num_buckets: int=None,
+            length_func=lambda x: x["duration"],
+            shuffle: bool=True,
+            batch_ordering: str="random",
+            max_batch_ex: int=None,
+            bucket_boundaries: List[int]=[],
+            lengths_list: List[int]=None,
+            seed: int=42,
+            epoch: int=0,
+            drop_last: bool=False,
+            verbose: bool=False, ):
+        self._dataset = dataset
+        self._ex_lengths = {}
+        ex_ids = self._dataset.data_ids
+        self.verbose = verbose
+
+        # We do not put a default on num_buckets to encourage users to play with this parameter
+        if num_buckets is None and len(bucket_boundaries) == 0:
+            raise RuntimeError(
+                "Please specify either num_buckets or bucket boundaries."
+                "Check the docs, and/or the tutorial !")
+
+        if lengths_list is not None:
+            # take length of examples from this argument and bypass length_key
+            for indx in range(len(lengths_list)):
+                self._ex_lengths[str(indx)] = lengths_list[indx]
+        else:
+            # use length func
+            if not isinstance(dataset, DynamicItemDataset):
+                raise NotImplementedError(
+                    "Dataset should be a DynamicItemDataset when using length function"
+                )
+            for indx in range(len(self._dataset)):
+                self._ex_lengths[str(indx)] = length_func(
+                    self._dataset.data[ex_ids[indx]])
+
+        if len(bucket_boundaries) > 0:
+            if not all([x >= 0 for x in bucket_boundaries]):
+                raise ValueError(
+                    "All elements in bucket boundaries should be non-negative (>= 0)."
+                )
+            if not len(set(bucket_boundaries)) == len(bucket_boundaries):
+                raise ValueError(
+                    "Bucket_boundaries should not contain duplicates.")
+            np.testing.assert_array_equal(
+                np.array(bucket_boundaries),
+                np.array(sorted(bucket_boundaries)),
+                err_msg="The arg bucket_boundaries should be an ascending sorted list of non negative values values!",
+            )
+            self._bucket_boundaries = np.array(sorted(bucket_boundaries))
+        else:
+            # use num_buckets
+            self._bucket_boundaries = np.array(
+                self._get_boundaries_through_warping(
+                    max_batch_length=max_batch_length,
+                    num_quantiles=num_buckets, ))
+
+        self._max_batch_length = max_batch_length
+        self._shuffle_ex = shuffle
+        self._batch_ordering = batch_ordering
+        self._seed = seed
+        self._drop_last = drop_last
+        if max_batch_ex is None:
+            max_batch_ex = np.inf
+        self._max_batch_ex = max_batch_ex
+        # Calculate bucket lengths - how often does one bucket boundary fit into max_batch_length?
+        self._bucket_lens = [
+            max(1, int(max_batch_length / self._bucket_boundaries[i]))
+            for i in range(len(self._bucket_boundaries))
+        ] + [1]
+        self._epoch = epoch
+        self._generate_batches()
+
+    def get_durations(self, batch):
+        """Gets durations of the elements in the batch."""
+        return [self._ex_lengths[str(idx)] for idx in batch]
+
+    def _get_boundaries_through_warping(
+            self,
+            max_batch_length: int,
+            num_quantiles: int, ) -> List[int]:
+
+        # NOTE: the following lines do not cover that there is only one example in the dataset
+        # warp frames (duration) distribution of train data
+        logger.info("Batch quantisation in latent space")
+        # linspace set-up
+        num_boundaries = num_quantiles + 1
+        # create latent linearly equal spaced buckets
+        latent_boundaries = np.linspace(
+            1 / num_boundaries,
+            num_quantiles / num_boundaries,
+            num_quantiles, )
+        # get quantiles using lognormal distribution
+        quantiles = lognorm.ppf(latent_boundaries, 1)
+        # scale up to to max_batch_length
+        bucket_boundaries = quantiles * max_batch_length / quantiles[-1]
+        # compute resulting bucket length multipliers
+        length_multipliers = [
+            bucket_boundaries[x + 1] / bucket_boundaries[x]
+            for x in range(num_quantiles - 1)
+        ]
+        # logging
+        logger.info(
+            "Latent bucket boundary - buckets: {} - length multipliers: {}".
+            format(
+                list(map("{:.2f}".format, bucket_boundaries)),
+                list(map("{:.2f}".format, length_multipliers)), ))
+        return list(sorted(bucket_boundaries))
+
+    def _permute_batches(self):
+        """Reorders ``self._batches`` in place per ``self._batch_ordering``.
+
+        "random" shuffles deterministically from ``_seed + _epoch``;
+        "ascending"/"descending" sort by each batch's longest example.
+        """
+        if self._batch_ordering == "random":
+            # A private NumPy generator keeps the shuffle reproducible without
+            # reseeding Paddle's global generator as a side effect.
+            rng = np.random.default_rng(self._seed + self._epoch)
+            order = rng.permutation(len(self._batches)).tolist()
+            self._batches = [self._batches[idx] for idx in order]
+        elif self._batch_ordering == "ascending":
+            self._batches = sorted(
+                self._batches,
+                key=lambda x: max([self._ex_lengths[str(idx)] for idx in x]), )
+        elif self._batch_ordering == "descending":
+            self._batches = sorted(
+                self._batches,
+                key=lambda x: max([self._ex_lengths[str(idx)] for idx in x]),
+                reverse=True, )
+        else:
+            raise NotImplementedError(
+                "Unknown batch_ordering: {}".format(self._batch_ordering))
+
+    def _generate_batches(self):
+        """Rebuilds ``self._batches`` by bucketing examples by length."""
+        logger.info("DynamicBatchSampler: Generating dynamic batches")
+        if self._shuffle_ex:
+            # deterministically shuffle based on epoch and seed; a private
+            # NumPy generator is used so that Paddle's global random state
+            # is not reseeded as a side effect of building batches
+            rng = np.random.default_rng(self._seed + self._epoch)
+            sampler = rng.permutation(len(self._dataset)).tolist()
+        else:
+            # take examples as they are: e.g. they have been sorted
+            sampler = range(len(self._dataset))  # type: ignore
+
+        self._batches = []
+        bucket_batches = [[] for i in self._bucket_lens]
+
+        stats_tracker = [{
+            "min": np.inf,
+            "max": -np.inf,
+            "tot": 0,
+            "n_ex": 0
+        } for i in self._bucket_lens]
+
+        for idx in sampler:
+            # length of pre-sampled audio
+            item_len = self._ex_lengths[str(idx)]
+            # bucket to fill up most padding
+            bucket_id = np.searchsorted(self._bucket_boundaries, item_len)
+            # fill audio's duration into that bucket
+            bucket_batches[bucket_id].append(idx)
+
+            stats_tracker[bucket_id]["min"] = min(
+                stats_tracker[bucket_id]["min"], item_len)
+            stats_tracker[bucket_id]["max"] = max(
+                stats_tracker[bucket_id]["max"], item_len)
+            stats_tracker[bucket_id]["tot"] += item_len
+            stats_tracker[bucket_id]["n_ex"] += 1
+
+            # a bucket is emitted as a batch once it holds its quota of
+            # examples, or max_batch_ex examples, whichever comes first
+            if (len(bucket_batches[bucket_id]) >= self._bucket_lens[bucket_id]
+                    or len(bucket_batches[bucket_id]) >= self._max_batch_ex):
+                self._batches.append(bucket_batches[bucket_id])
+                bucket_batches[bucket_id] = []
+
+        # Dump remaining batches
+        if not self._drop_last:
+            for batch in bucket_batches:
+                if batch:
+                    self._batches.append(batch)
+
+        self._permute_batches()  # possibly reorder batches
+
+        if self._epoch == 0:  # only log at first epoch
+            # frames per batch & their padding remaining
+            boundaries = [0] + self._bucket_boundaries.tolist()
+
+            for bucket_indx in range(len(self._bucket_boundaries)):
+                try:
+                    num_batches = stats_tracker[bucket_indx]["tot"] // (
+                        self._max_batch_length)
+                    pad_factor = (stats_tracker[bucket_indx]["max"] -
+                                  stats_tracker[bucket_indx]["min"]) / (
+                                      stats_tracker[bucket_indx]["tot"] /
+                                      stats_tracker[bucket_indx]["n_ex"])
+                except ZeroDivisionError:
+                    num_batches = 0
+                    pad_factor = 0
+
+                logger.info((
+                    "DynamicBatchSampler: Bucket {} with boundary {:.1f}-{:.1f} and "
+                    +
+                    "batch_size {}: Num Examples {:.1f}, Num Full Batches {:.3f}, Pad Factor {:.3f}."
+                ).format(
+                    bucket_indx,
+                    boundaries[bucket_indx],
+                    boundaries[bucket_indx + 1],
+                    self._bucket_lens[bucket_indx],
+                    stats_tracker[bucket_indx]["n_ex"],
+                    num_batches,
+                    pad_factor * 100, ))
+
+            if self.verbose:
+                batch_stats = {
+                    "tot_frames": [],
+                    "tot_pad_frames": [],
+                    "pad_%": [],
+                }
+                for batch in self._batches:
+                    tot_frames = sum(
+                        [self._ex_lengths[str(idx)] for idx in batch])
+                    batch_stats["tot_frames"].append(tot_frames)
+                    max_frames = max(
+                        [self._ex_lengths[str(idx)] for idx in batch])
+                    tot_pad = sum([
+                        max_frames - self._ex_lengths[str(idx)] for idx in batch
+                    ])
+                    batch_stats["tot_pad_frames"].append(tot_pad)
+                    batch_stats["pad_%"].append(tot_pad / tot_frames * 100)
+
+                padding_details = "Batch {} with {:.1f} frames with {} files - {:.1f} padding, {:.2f} (%) of total."
+                padding_details = "DynamicBatchSampler: " + padding_details
+                for i in range(len(self._batches)):
+                    logger.info(
+                        padding_details.format(
+                            i,
+                            batch_stats["tot_frames"][i],
+                            len(self._batches[i]),
+                            batch_stats["tot_pad_frames"][i],
+                            batch_stats["pad_%"][i], ))
+
+    def __iter__(self):
+        for batch in self._batches:
+            yield batch
+        if self._shuffle_ex:  # re-generate examples if ex_ordering == "random"
+            self._generate_batches()
+        if self._batch_ordering == "random":
+            # we randomly permute the batches only --> faster
+            self._permute_batches()
+
+    def set_epoch(self, epoch):
+        """
+        You can also just access self.epoch, but we maintain this interface
+        to mirror paddle.io.DistributedBatchSampler
+        """
+        self._epoch = epoch
+        self._generate_batches()
+
+    def __len__(self):
+        return len(self._batches)
+
+
+class BalancingDataSampler(ReproducibleWeightedRandomSampler):
+    """A data sampler that takes a single key from the dataset and
+    ensures an approximately equal distribution by that key
+
+    Arguments
+    ---------
+    dataset: DynamicItemDataset
+        the dataset form which samples will be drawn
+    key: str
+        the key from which samples will be taken
+    num_samples : int
+        Number of samples to draw
+    replacement : bool
+        To draw with replacement or not (within an epoch of num_samples).
+    seed : int
+        The base seed to use for the random number generator. It is recommended
+        to use a value which has a good mix of 0 and 1 bits.
+    epoch : int
+        The epoch to start at.
+
+    """
+
+    def __init__(
+            self,
+            dataset,
+            key,
+            num_samples=None,
+            replacement=True,
+            seed=563375142,
+            epoch=0,
+            **kwargs, ):
+        self.dataset = dataset
+        self.key = key
+        if not num_samples:
+            num_samples = len(dataset)
+        weights = self._compute_weights()
+        super().__init__(weights, num_samples, replacement, seed, epoch,
+                         **kwargs)
+
+    def _compute_weights(self):
+        with self.dataset.output_keys_as([self.key]):
+            class_ids = [item[self.key] for item in self.dataset]
+            class_counter = Counter(class_ids)
+        weights = 1 / paddle.to_tensor(
+            [class_counter[class_id] for class_id in class_ids])
+        return weights
diff --git a/paddlespeech/s2t/io/speechbrain/sb_pipeline.py b/paddlespeech/s2t/io/speechbrain/sb_pipeline.py
new file mode 100755
index 000000000..0a5cf82f1
--- /dev/null
+++ b/paddlespeech/s2t/io/speechbrain/sb_pipeline.py
@@ -0,0 +1,156 @@
+# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# 
+# Modified from speechbrain 2023 (https://github.com/speechbrain/speechbrain/blob/develop/recipes/AISHELL-1/ASR/CTC/train_with_wav2vec.py)
+import data_pipeline
+import dataio
+import numpy
+import paddle
+import tqdm
+import transformers
+from dataloader import make_dataloader
+from hyperpyyaml import load_hyperpyyaml
+
+import dataset
+
+
+def dataio_prepare(hparams):
+    """Build the train/valid/test datasets, the tokenizer and, when enabled, the
+    dynamic batch samplers; they are returned as a 6-tuple in that order."""
+    data_folder = hparams["data_folder"]
+
+    train_data = dataset.DynamicItemDataset.from_csv(
+        csv_path=hparams["train_data"],
+        replacements={"data_root": data_folder}, )
+
+    if hparams["sorting"] == "ascending":
+        # we sort training data to speed up training and get better results.
+        train_data = train_data.filtered_sorted(sort_key="duration")
+        # shuffling in the dataloader would undo the sort, so it is disabled
+        hparams["train_dataloader_opts"]["shuffle"] = False
+
+    elif hparams["sorting"] == "descending":
+        train_data = train_data.filtered_sorted(
+            sort_key="duration", reverse=True)
+        # shuffling in the dataloader would undo the sort, so it is disabled
+        hparams["train_dataloader_opts"]["shuffle"] = False
+
+    elif hparams["sorting"] == "random":
+        pass  # nothing to sort; dataloader options are left untouched
+
+    else:
+        raise NotImplementedError(
+            "sorting must be random, ascending or descending")
+
+    valid_data = dataset.DynamicItemDataset.from_csv(
+        csv_path=hparams["valid_data"],
+        replacements={"data_root": data_folder}, )
+    valid_data = valid_data.filtered_sorted(sort_key="duration")
+
+    test_data = dataset.DynamicItemDataset.from_csv(
+        csv_path=hparams["test_data"],
+        replacements={"data_root": data_folder}, )
+    test_data = test_data.filtered_sorted(sort_key="duration")
+
+    datasets = [train_data, valid_data, test_data]
+
+    # 1. Load the pretrained tokenizer shared by all three datasets
+    tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-chinese')
+
+    # 2. Define audio pipeline:
+    @data_pipeline.takes("wav")
+    @data_pipeline.provides("sig")
+    def audio_pipeline(wav):
+        sig = dataio.read_audio(wav)
+        return sig
+
+    dataset.add_dynamic_item(datasets, audio_pipeline)
+
+    # 3. Define text pipeline:
+    @data_pipeline.takes("transcript")
+    @data_pipeline.provides("wrd", "tokens_list", "tokens")
+    def text_pipeline(wrd):
+        wrd = "".join(wrd.split(" "))  # remove every space character
+        yield wrd
+        tokens_list = tokenizer(wrd)["input_ids"]
+        yield tokens_list
+        tokens = numpy.array(tokens_list, dtype="int64")
+        yield tokens
+
+    dataset.add_dynamic_item(datasets, text_pipeline)
+
+    # 4. Set output:
+    dataset.set_output_keys(
+        datasets,
+        ["id", "sig", "wrd", "tokens"], )
+
+    # 5. Samplers stay None unless dynamic batching is enabled in hparams.
+    train_batch_sampler = None
+    valid_batch_sampler = None
+    if hparams["dynamic_batching"]:
+        from sampler import DynamicBatchSampler  # noqa
+
+        dynamic_hparams = hparams["dynamic_batch_sampler"]
+        num_buckets = dynamic_hparams["num_buckets"]
+
+        train_batch_sampler = DynamicBatchSampler(
+            train_data,
+            dynamic_hparams["max_batch_len"],
+            num_buckets=num_buckets,
+            length_func=lambda x: x["duration"],
+            shuffle=dynamic_hparams["shuffle_ex"],
+            batch_ordering=dynamic_hparams["batch_ordering"], )
+
+        valid_batch_sampler = DynamicBatchSampler(
+            valid_data,
+            dynamic_hparams["max_batch_len"],
+            num_buckets=num_buckets,
+            length_func=lambda x: x["duration"],
+            shuffle=dynamic_hparams["shuffle_ex"],
+            batch_ordering=dynamic_hparams["batch_ordering"], )
+
+    return (train_data, valid_data, test_data, tokenizer, train_batch_sampler,
+            valid_batch_sampler, )
+
+
+hparams_file = 'train_with_wav2vec.yaml'  # relative to the working directory
+with open(hparams_file) as fin:
+    hparams = load_hyperpyyaml(fin, None)
+# Executed at module level (on import or run): build datasets and samplers.
+(train_data, valid_data, test_data, tokenizer, train_bsampler,
+ valid_bsampler, ) = dataio_prepare(hparams)
+
+train_dataloader_opts = hparams["train_dataloader_opts"]
+valid_dataloader_opts = hparams["valid_dataloader_opts"]
+# When dynamic batch samplers exist, they replace the configured loader options.
+if train_bsampler is not None:
+    train_dataloader_opts = {
+        "batch_sampler": train_bsampler,
+        "num_workers": hparams["num_workers"],
+    }
+
+if valid_bsampler is not None:
+    valid_dataloader_opts = {"batch_sampler": valid_bsampler}
+
+train_set = make_dataloader(train_data, stage='train', **train_dataloader_opts)
+
+valid_set = make_dataloader(
+    valid_data,
+    stage='train',  # NOTE(review): 'train' for the validation loader — confirm
+    **valid_dataloader_opts, )
+# Iterate the validation loader once, printing every batch.
+for batch in valid_set:
+    print(batch)
+print('done')  # exit()
diff --git a/paddlespeech/s2t/models/wav2vec2/__init__.py b/paddlespeech/s2t/models/wav2vec2/__init__.py
index 3a12a9cf3..1ad761712 100644
--- a/paddlespeech/s2t/models/wav2vec2/__init__.py
+++ b/paddlespeech/s2t/models/wav2vec2/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py b/paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py
index 0c4ade7b7..7267e2211 100644
--- a/paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py
+++ b/paddlespeech/s2t/models/wav2vec2/processing/signal_processing.py
@@ -1,10 +1,5 @@
-# Authors
-#  * Peter Plantinga 2020
-#  * Francois Grondin 2020
-#  * William Aris 2020
-#  * Samuele Cornell 2020
-#  * Sarthak Yadav 2022
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,7 +12,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# Modified from speechbrain(https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/processing/signal_processing.py)
+# Modified from speechbrain 2023 (https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/processing/signal_processing.py)
+"""
+Low level signal processing utilities
+Authors
+ * Peter Plantinga 2020
+ * Francois Grondin 2020
+ * William Aris 2020
+ * Samuele Cornell 2020
+ * Sarthak Yadav 2022
+"""
 import numpy as np
 import paddle
 
diff --git a/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py b/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py
index 9224549a4..5482ed561 100644
--- a/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py
+++ b/paddlespeech/s2t/models/wav2vec2/processing/speech_augmentation.py
@@ -1,5 +1,4 @@
-# Authors
-# * Peter Plantinga 2020
+# Copyright (c) 2023 speechbrain Authors. All Rights Reserved.
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -14,6 +13,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Modified from speechbrain(https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/processing/speech_augmentation.py)
+"""Classes for mutating speech data for data augmentation.
+This module provides classes that produce realistic distortions of speech
+data for the purpose of training speech processing models. The list of
+distortions includes adding noise, adding reverberation, changing speed,
+and more. All the classes are of type `paddle.nn.Layer`. This gives the
+possibility to have end-to-end differentiability and
+backpropagate the gradient through them. In addition, all operations
+are expected to be performed on the GPU (where available) for efficiency.
+
+Authors
+ * Peter Plantinga 2020
+"""
 import math
 
 import paddle
@@ -64,7 +75,6 @@ class SpeedPerturb(nn.Layer):
 
         # Initialize index of perturbation
         self.samp_index = 0
-
         # Initialize resamplers
         self.resamplers = []
         for speed in self.speeds:
@@ -89,7 +99,6 @@ class SpeedPerturb(nn.Layer):
 
         # Don't perturb (return early) 1-`perturb_prob` portion of the batches
         if paddle.rand([1]) > self.perturb_prob:
-
             return waveform.clone()
         # Perform a random perturbation
         self.samp_index = paddle.randint(len(self.speeds), shape=(1, ))[0]
@@ -456,10 +465,6 @@ class DropFreq(nn.Layer):
             high=self.drop_count_high + 1,
             shape=(1, ), )
 
-        # Pick a frequency to drop
-        drop_range = self.drop_freq_high - self.drop_freq_low
-        drop_frequency = (
-            paddle.rand(drop_count) * drop_range + self.drop_freq_low)
         # Filter parameters
         filter_length = 101
         pad = filter_length // 2
@@ -467,13 +472,19 @@ class DropFreq(nn.Layer):
         # Start with delta function
         drop_filter = paddle.zeros([1, filter_length, 1])
         drop_filter[0, pad, 0] = 1
-        # Subtract each frequency
-        for frequency in drop_frequency:
-            notch_kernel = notch_filter(
-                frequency,
-                filter_length,
-                self.drop_width, )
-            drop_filter = convolve1d(drop_filter, notch_kernel, pad)
+
+        if int(drop_count) > 0:
+            # Pick the frequencies to drop (skipped when the drawn count is 0)
+            drop_range = self.drop_freq_high - self.drop_freq_low
+            drop_frequency = (
+                paddle.rand(drop_count) * drop_range + self.drop_freq_low)
+            # Subtract each frequency
+            for frequency in drop_frequency:
+                notch_kernel = notch_filter(
+                    frequency,
+                    filter_length,
+                    self.drop_width, )
+                drop_filter = convolve1d(drop_filter, notch_kernel, pad)
 
         # Apply filter
         dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad)
@@ -736,8 +747,7 @@ class SpecAugment(paddle.nn.Layer):
         # compute center and corresponding window
         c = paddle.randint(window, time - window, (1, ))[0]
         w = paddle.randint(c - window, c + window, (1, ))[0] + 1
-        # c = 5
-        # w = 10
+
         left = paddle.nn.functional.interpolate(
             x[:, :, :c],
             (w, x.shape[3]),
diff --git a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
old mode 100644
new mode 100755
index dc6c6d1d3..baa7392eb
--- a/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
+++ b/paddlespeech/s2t/models/wav2vec2/wav2vec2_ASR.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from collections import defaultdict
+from turtle import Turtle
 from typing import Dict
 from typing import List
 from typing import Tuple
@@ -57,18 +58,24 @@ class Wav2vec2ASR(nn.Layer):
     def forward(self, wav, wavs_lens_rate, target, target_lens):
         if self.normalize_wav:
             wav = F.layer_norm(wav, wav.shape)
+
         # Extract wav2vec output
         out = self.wav2vec2(wav)[0]
         # We normalize the output if required
         if self.output_norm:
             out = F.layer_norm(out, out.shape)
-        if self.train and hasattr(self.config, 'spec_augment'):
+
+        if self.training and hasattr(self.config, 'spec_augment'):
             feats = self.spec_augment(out)
         else:
             feats = out
+
         x = self.enc(feats)
+
         x_lens = (wavs_lens_rate * x.shape[1]).round().astype(paddle.int64)
+
         ctc_loss = self.ctc(x, x_lens, target, target_lens)
+
         return ctc_loss
 
     @paddle.no_grad()
@@ -77,50 +84,60 @@ class Wav2vec2ASR(nn.Layer):
                text_feature: Dict[str, int],
                decoding_method: str,
                beam_size: int,
-               tokenizer: str=None):
+               tokenizer: str=None,
+               sb_pipeline=False):
         batch_size = feats.shape[0]
 
         if decoding_method == 'ctc_prefix_beam_search' and batch_size > 1:
-            raise ValueError(
+            logger.error(
                 f"decoding mode {decoding_method} must be running with batch_size == 1"
             )
+            logger.error(f"current batch_size is {batch_size}")
 
         if decoding_method == 'ctc_greedy_search':
-            if tokenizer is None:
+            if tokenizer is None and sb_pipeline is False:
                 hyps = self.ctc_greedy_search(feats)
                 res = [text_feature.defeaturize(hyp) for hyp in hyps]
                 res_tokenids = [hyp for hyp in hyps]
             else:
-                hyps = self.ctc_greedy_search(feats)
+                if sb_pipeline is True:
+                    hyps = self.ctc_greedy_search(feats.unsqueeze(-1))
+                else:
+                    hyps = self.ctc_greedy_search(feats)
                 res = []
                 res_tokenids = []
                 for sequence in hyps:
-                    # Decode token terms to words
+                    # Decode token terms to words 
                     predicted_tokens = text_feature.convert_ids_to_tokens(
                         sequence)
-                    tmp_res = []
-                    tmp_res_tokenids = []
-                    for c in predicted_tokens:
-                        if c == "[CLS]":
-                            continue
-                        elif c == "[SEP]" or c == "[PAD]":
-                            break
-                        else:
-                            tmp_res.append(c)
-                            tmp_res_tokenids.append(text_feature.vocab[c])
-                    res.append(''.join(tmp_res))
-                    res_tokenids.append(tmp_res_tokenids)
+                    tmp_res = []
+                    tmp_res_tokenids = []
+                    for c in predicted_tokens:
+                        if c == "[CLS]":
+                            continue
+                        elif c == "[SEP]" or c == "[PAD]":
+                            break
+                        else:
+                            tmp_res.append(c)
+                            tmp_res_tokenids.append(text_feature.vocab[c])
+                    res.append(''.join(tmp_res))
+                    res_tokenids.append(tmp_res_tokenids)
+
         # ctc_prefix_beam_search and attention_rescoring only return one
         # result in List[int], change it to List[List[int]] for compatible
         # with other batch decoding mode
         elif decoding_method == 'ctc_prefix_beam_search':
             assert feats.shape[0] == 1
-            if tokenizer is None:
+            if tokenizer is None and sb_pipeline is False:
                 hyp = self.ctc_prefix_beam_search(feats, beam_size)
                 res = [text_feature.defeaturize(hyp)]
                 res_tokenids = [hyp]
             else:
-                hyp = self.ctc_prefix_beam_search(feats, beam_size)
+                if sb_pipeline is True:
+                    hyp = self.ctc_prefix_beam_search(
+                        feats.unsqueeze(-1), beam_size)
+                else:
+                    hyp = self.ctc_prefix_beam_search(feats, beam_size)
                 res = []
                 res_tokenids = []
                 predicted_tokens = text_feature.convert_ids_to_tokens(hyp)
@@ -290,13 +307,10 @@ class Wav2vec2Base(nn.Layer):
     @classmethod
     def from_config(cls, configs: dict):
         """init model.
-
         Args:
             configs (dict): config dict.
-
         Raises:
             ValueError: raise when using not support encoder type.
-
         Returns:
             nn.Layer: Wav2Vec2Base
         """
diff --git a/paddlespeech/s2t/models/whisper/__init__.py b/paddlespeech/s2t/models/whisper/__init__.py
index 98ab23610..b78dece8a 100644
--- a/paddlespeech/s2t/models/whisper/__init__.py
+++ b/paddlespeech/s2t/models/whisper/__init__.py
@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 # 
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/__init__.py)
 from paddlespeech.s2t.models.whisper.whipser import decode
diff --git a/paddlespeech/s2t/models/whisper/tokenizer.py b/paddlespeech/s2t/models/whisper/tokenizer.py
index 1e1aea044..e8b201bcc 100644
--- a/paddlespeech/s2t/models/whisper/tokenizer.py
+++ b/paddlespeech/s2t/models/whisper/tokenizer.py
@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 # 
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/tokenizer.py)
 import os
diff --git a/paddlespeech/s2t/models/whisper/utils.py b/paddlespeech/s2t/models/whisper/utils.py
index d067af7d2..5528f9604 100644
--- a/paddlespeech/s2t/models/whisper/utils.py
+++ b/paddlespeech/s2t/models/whisper/utils.py
@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 # 
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper/utils.py)
 import zlib
diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py
index 9cf9a9eca..a28013e4b 100644
--- a/paddlespeech/s2t/models/whisper/whipser.py
+++ b/paddlespeech/s2t/models/whisper/whipser.py
@@ -1,5 +1,5 @@
 # MIT License, Copyright (c) 2022 OpenAI.
-# Copyright (c) 2022 PaddlePaddle Authors and . All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Modified from OpenAI Whisper 2022 (https://github.com/openai/whisper/whisper)
 import os
diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py
index 26ac501e2..be6fcf589 100644
--- a/paddlespeech/s2t/training/gradclip.py
+++ b/paddlespeech/s2t/training/gradclip.py
@@ -43,8 +43,8 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
             if g.type == core.VarDesc.VarType.SELECTED_ROWS:
                 merge_grad = layers.merge_selected_rows(g)
                 merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
-            square = layers.square(merge_grad)
-            sum_square = layers.reduce_sum(square)
+            square = paddle.square(merge_grad)
+            sum_square = paddle.sum(square)
             sum_square_list.append(sum_square)
 
             # debug log, not dump all since slow down train process
@@ -57,23 +57,24 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
             return params_grads
 
         global_norm_var = layers.concat(sum_square_list)
-        global_norm_var = layers.reduce_sum(global_norm_var)
-        global_norm_var = layers.sqrt(global_norm_var)
+        global_norm_var = paddle.sum(global_norm_var)
+        global_norm_var = paddle.sqrt(global_norm_var)
+
         # debug log
         logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!")
 
         max_global_norm = layers.fill_constant(
             shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm)
-        clip_var = layers.elementwise_div(
+        clip_var = paddle.divide(
             x=max_global_norm,
-            y=layers.elementwise_max(x=global_norm_var, y=max_global_norm))
+            y=paddle.maximum(x=global_norm_var, y=max_global_norm))
         for i, (p, g) in enumerate(params_grads):
             if g is None:
                 continue
             if getattr(p, 'need_clip', True) is False:
                 params_and_grads.append((p, g))
                 continue
-            new_grad = layers.elementwise_mul(x=g, y=clip_var)
+            new_grad = paddle.multiply(x=g, y=clip_var)
             params_and_grads.append((p, new_grad))
 
             # debug log, not dump all since slow down train process
diff --git a/paddlespeech/s2t/training/optimizer.py b/paddlespeech/s2t/training/optimizer/__init__.py
similarity index 99%
rename from paddlespeech/s2t/training/optimizer.py
rename to paddlespeech/s2t/training/optimizer/__init__.py
index f7f70c570..aafdc5b6a 100644
--- a/paddlespeech/s2t/training/optimizer.py
+++ b/paddlespeech/s2t/training/optimizer/__init__.py
@@ -19,7 +19,6 @@ from typing import Text
 import paddle
 from paddle.optimizer import Optimizer
 from paddle.regularizer import L2Decay
-
 from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.s2t.utils.dynamic_import import instance_class
diff --git a/paddlespeech/s2t/training/optimizer/adadelta.py b/paddlespeech/s2t/training/optimizer/adadelta.py
new file mode 100644
index 000000000..900b697c5
--- /dev/null
+++ b/paddlespeech/s2t/training/optimizer/adadelta.py
@@ -0,0 +1,239 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+from paddle.fluid import framework
+from paddle.optimizer import Optimizer
+
+__all__ = []
+
+
+class SimpleAdadelta(Optimizer):
+    r"""
+    **Notes: This API does not support sparse parameter optimization.**
+
+    Adadelta Optimizer. Please refer to this for details:
+    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD <https://arxiv.org/abs/1212.5701>`_.
+
+    The update is done as follows:
+
+    .. math::
+
+        E(g_t^2) &= \rho * E(g_{t-1}^2) + (1-\rho) * g^2
+
+        learning\_rate &= \sqrt{ ( E(dx_{t-1}^2) + \epsilon ) / ( E(g_t^2) + \epsilon ) }
+
+        E(dx_t^2) &= \rho * E(dx_{t-1}^2) + (1-\rho) * (-g*learning\_rate)^2
+
+    Args:
+        learning_rate (float|Tensor|LearningRateDecay, optional): The learning rate used to update ``Parameter``.
+            It can be a float value, a ``Tensor`` with a float type or a LearningRateDecay. The default value is 0.001.
+        epsilon (float): a small float number for numeric stability. Default 1.0e-6.
+        rho (float): a floating point value indicating the decay rate. Default 0.95.
+        parameters (list|tuple, optional): List/Tuple of ``Tensor`` to update to minimize ``loss``. \
+            This parameter is required in dygraph mode. And you can specify different options for \
+            different parameter groups such as the learning rate, weight decay, etc, \
+            then the parameters are list of dict. Note that the learning_rate in parameter groups \
+            represents the scale of base learning_rate. \
+            The default value is None in static mode, at this time all parameters will be updated.
+        weight_decay (float|WeightDecayRegularizer, optional): The strategy of regularization. \
+            It can be a float value as coeff of L2 regularization or \
+            :ref:`api_fluid_regularizer_L1Decay`, :ref:`api_fluid_regularizer_L2Decay`.
+            If a parameter has set regularizer using :ref:`api_fluid_ParamAttr` already, \
+            the regularization setting here in optimizer will be ignored for this parameter. \
+            Otherwise, the regularization setting here in optimizer will take effect. \
+            Default None, meaning there is no regularization.
+        foreach (bool, optional): whether foreach implementation of optimizer is used. The default value is None.
+        maximize (bool, optional): maximize the params based on the objective, instead of minimizing.
+            The default value is False.
+        name (str, optional): The default value is None. Normally there is no need for user
+                to set this property. For more information, please refer to
+                :ref:`api_guide_Name` .
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            from paddlespeech.s2t.training.optimizer.adadelta import SimpleAdadelta
+
+            inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1)
+            linear = paddle.nn.Linear(10, 10)
+            out = linear(inp)
+            loss = paddle.mean(out)
+            adadelta = SimpleAdadelta(learning_rate=0.1, parameters=linear.parameters(), weight_decay=0.01)
+            out.backward()
+            adadelta.step()
+            adadelta.clear_grad()
+
+    """
+
+    def __init__(
+            self,
+            learning_rate=0.001,
+            epsilon=1.0e-6,
+            rho=0.95,
+            parameters=None,
+            weight_decay=0.0,
+            foreach=None,
+            maximize=False,
+            name=None, ):
+        if learning_rate is None:
+            raise ValueError("learning_rate is not set.")
+        if epsilon is None:
+            raise ValueError("epsilon is not set.")
+        if rho is None:
+            raise ValueError("rho is not set.")
+        super(SimpleAdadelta, self).__init__(
+            learning_rate=learning_rate,
+            parameters=parameters,
+            weight_decay=weight_decay,
+            name=name, )
+
+        self._epsilon = epsilon
+        self._rho = rho
+
+        self.state = 0  # self.state is 0 or 1, use to control init square_avgs and acc_deltas
+        self._weight_decay = weight_decay
+        self._learning_rate = learning_rate
+        self._foreach = foreach
+        self._maximize = maximize
+        self.square_avgs = []
+        self.acc_deltas = []
+
+    @paddle.no_grad()
+    @framework.dygraph_only
+    def step(self):
+        """Performs a single optimization step.
+
+        State tensors are created only on the first call, one per parameter
+        that has a gradient then, and are later matched to ``params_grads`` by
+        position. NOTE(review): assumes that set never changes — confirm.
+        """
+        if not isinstance(self._parameter_list[0], dict):
+            params_grads = []
+            for param in self._parameter_list:
+                if param.stop_gradient:
+                    continue
+                if param._grad_ivar() is not None:
+                    grad_var = param._grad_ivar()
+
+                    params_grads.append((param, grad_var))
+                    if self.state == 0:
+                        self.square_avg = paddle.zeros_like(param)
+                        self.acc_delta = paddle.zeros_like(param)
+                        self.square_avgs.append(self.square_avg)
+                        self.acc_deltas.append(self.acc_delta)
+
+        else:
+            # optimize parameters in groups
+            params_grads = []
+            for idx, param_group in enumerate(self._param_groups):
+                for param in param_group['params']:
+                    if param.stop_gradient:
+                        continue
+                    if param._grad_ivar() is not None:
+                        grad_var = param._grad_ivar()
+                        params_grads.append((param, grad_var))
+                        if self.state == 0:
+                            self.square_avg = paddle.zeros_like(param)
+                            self.acc_delta = paddle.zeros_like(param)
+                            self.square_avgs.append(self.square_avg)
+                            self.acc_deltas.append(self.acc_delta)
+
+        self.state = 1
+        adadelta(
+            params_grads,
+            square_avgs=self.square_avgs,
+            acc_deltas=self.acc_deltas,
+            learning_rate=self._learning_rate,
+            rho=self._rho,
+            epsilon=self._epsilon,
+            weight_decay=self._weight_decay,
+            foreach=self._foreach,
+            maximize=self._maximize)
+
+
+def adadelta(params_grads,
+             square_avgs,
+             acc_deltas,
+             foreach=None,
+             *,
+             learning_rate: float,
+             rho: float,
+             epsilon: float,
+             weight_decay: float,
+             maximize: bool):
+    """Apply one Adadelta update to every ``(param, grad)`` pair.
+
+    ``foreach`` is accepted for interface compatibility only: this module has
+    no multi-tensor implementation, so every value uses the single-tensor
+    path. (A truthy ``foreach`` previously raised ``UnboundLocalError``.)
+    """
+    func = _single_tensor_adadelta
+
+    func(
+        params_grads,
+        square_avgs,
+        acc_deltas,
+        learning_rate=learning_rate,
+        rho=rho,
+        epsilon=epsilon,
+        weight_decay=weight_decay,
+        maximize=maximize)
+
+
+def _single_tensor_adadelta(params_grads,
+                            square_avgs,
+                            acc_deltas,
+                            *,
+                            learning_rate: float,
+                            rho: float,
+                            epsilon: float,
+                            weight_decay: float,
+                            maximize: bool):
+    """Update each parameter in place with one Adadelta step.
+
+    ``square_avgs`` / ``acc_deltas`` are matched to ``params_grads`` by position
+    and updated in place as the running averages of grad^2 and delta^2."""
+    for (params_grad, square_avg, acc_delta) in zip(params_grads, square_avgs,
+                                                    acc_deltas):
+        param, grad = params_grad
+        grad = grad if not maximize else -grad
+        if weight_decay != 0:
+            # L2 penalty on a new tensor; the stored gradient is left untouched
+            grad = grad.add(param * weight_decay)
+        if paddle.is_complex(param):
+            square_avg = paddle.as_real(square_avg)
+            acc_delta = paddle.as_real(acc_delta)
+            grad = paddle.as_real(grad)
+        # square_avg = square_avg * rho + (1-rho) * grad * grad
+        square_avg.set_value(
+            paddle.multiply(square_avg, paddle.to_tensor(rho)).add(
+                paddle.multiply(paddle.to_tensor(1 - rho), grad.square())))
+        # std = (square_avg + eps).sqrt()
+        std = square_avg.add(paddle.to_tensor(epsilon)).sqrt_()
+        # delta = (acc_delta + eps).sqrt() / std * grad
+        delta = (paddle.multiply(
+            paddle.divide(
+                acc_delta.add(paddle.to_tensor(epsilon)).sqrt_(), std), grad))
+        # acc_delta = acc_delta * rho + (1-rho) * delta * delta
+        acc_delta.set_value(
+            paddle.multiply(acc_delta, paddle.to_tensor(rho)).add(
+                paddle.multiply(paddle.to_tensor(1 - rho), delta.square())))
+        if paddle.is_complex(param):
+            delta = paddle.as_real(delta)
+        # param = param - delta*learning_rate
+        param.set_value(
+            param.add(
+                paddle.multiply(
+                    delta.astype('float32'), paddle.to_tensor(-learning_rate))))
diff --git a/paddlespeech/t2s/datasets/get_feats.py b/paddlespeech/t2s/datasets/get_feats.py
index 21458f152..a90f1a417 100644
--- a/paddlespeech/t2s/datasets/get_feats.py
+++ b/paddlespeech/t2s/datasets/get_feats.py
@@ -102,7 +102,7 @@ class Pitch():
 
     def _convert_to_continuous_f0(self, f0: np.ndarray) -> np.ndarray:
         if (f0 == 0).all():
-            print("All frames seems to be unvoiced.")
+            print("All frames seems to be unvoiced, this utt will be removed.")
             return f0
 
         # padding start and end of f0 sequence
diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
index e450aa1a0..c43dafb3c 100644
--- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py
@@ -437,7 +437,7 @@ if __name__ == '__main__':
 
     vocab_phones = {}
 
-    with open(args.phones_dict, 'rt') as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     for phn, id in phn_id:
         vocab_phones[phn] = int(id)
diff --git a/paddlespeech/t2s/exps/ernie_sat/train.py b/paddlespeech/t2s/exps/ernie_sat/train.py
index 75a666bb1..c98d691be 100644
--- a/paddlespeech/t2s/exps/ernie_sat/train.py
+++ b/paddlespeech/t2s/exps/ernie_sat/train.py
@@ -109,7 +109,7 @@ def train_sp(args, config):
         num_workers=config.num_workers)
     print("dataloaders done!")
 
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/exps/fastspeech2/preprocess.py b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
index f4acdc60b..521b9a880 100644
--- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py
+++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
@@ -54,8 +54,15 @@ def process_sentence(config: Dict[str, Any],
     record = None
     if utt_id in sentences:
         # reading, resampling may occur
-        wav, _ = librosa.load(str(fp), sr=config.fs)
-        if len(wav.shape) != 1:
+        wav, _ = librosa.load(
+            str(fp), sr=config.fs,
+            mono=False) if "canton" in str(fp) else librosa.load(
+                str(fp), sr=config.fs)
+        if len(wav.shape) == 2 and "canton" in str(fp):
+            # Remind that Cantonese datasets should be placed in ~/datasets/canton_all. Otherwise, it may cause problem.
+            wav = wav[0]
+            wav = np.ascontiguousarray(wav)
+        elif len(wav.shape) != 1:
             return record
         max_value = np.abs(wav).max()
         if max_value > 1.0:
@@ -102,6 +109,8 @@ def process_sentence(config: Dict[str, Any],
         np.save(mel_path, logmel)
         # extract pitch and energy
         f0 = pitch_extractor.get_pitch(wav, duration=np.array(durations))
+        if (f0 == 0).all():
+            return None
         assert f0.shape[0] == len(durations)
         f0_dir = output_dir / "data_pitch"
         f0_dir.mkdir(parents=True, exist_ok=True)
@@ -282,7 +291,20 @@ def main():
                 test_wav_files += wav_files[-sub_num_dev:]
             else:
                 train_wav_files += wav_files
-
+    elif args.dataset == "canton":
+        sub_num_dev = 5
+        wav_dir = rootdir / "WAV"
+        train_wav_files = []
+        dev_wav_files = []
+        test_wav_files = []
+        for speaker in os.listdir(wav_dir):
+            wav_files = sorted(list((wav_dir / speaker).rglob("*.wav")))
+            if len(wav_files) > 100:
+                train_wav_files += wav_files[:-sub_num_dev * 2]
+                dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
+                test_wav_files += wav_files[-sub_num_dev:]
+            else:
+                train_wav_files += wav_files
     elif args.dataset == "ljspeech":
         wav_files = sorted(list((rootdir / "wavs").rglob("*.wav")))
         # split data into 3 sections
diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py
index d31e62a82..97626db0b 100644
--- a/paddlespeech/t2s/exps/fastspeech2/train.py
+++ b/paddlespeech/t2s/exps/fastspeech2/train.py
@@ -67,7 +67,7 @@ def train_sp(args, config):
     if args.speaker_dict is not None:
         print("multiple speaker fastspeech2!")
         collate_fn = fastspeech2_multi_spk_batch_fn
-        with open(args.speaker_dict, 'rt') as f:
+        with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
             spk_id = [line.strip().split() for line in f.readlines()]
         spk_num = len(spk_id)
         fields += ["spk_id"]
@@ -123,7 +123,7 @@ def train_sp(args, config):
         num_workers=config.num_workers)
     print("dataloaders done!")
 
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/exps/sentences_canton.txt b/paddlespeech/t2s/exps/sentences_canton.txt
new file mode 100644
index 000000000..5ab5f7f36
--- /dev/null
+++ b/paddlespeech/t2s/exps/sentences_canton.txt
@@ -0,0 +1,20 @@
+001 白云山爬过一次嘅，好远啊，爬上去都成两个钟
+002 睇书咯，番屋企，而家好多人好少睇书噶喎
+003 因为如果唔考试嘅话，工资好低噶
+004 冇固定噶，你中意休边日就边日噶
+005 即系太迟嘅话咧，落班太迟嘅话就喺出边食啲咯
+006 是非有公理，慎言莫冒犯别人
+007 遇上冷风雨，休太认真
+008 痴线蜘蛛条蜘蛛丝痴住枝树枝
+009 一蚊一斤鸡，一蚊一斤龟，究竟係鸡贵定係龟贵
+010 错就要认，打要企定
+011 宜家唔系事必要你讲，但系你所讲嘅说话将会成为呈堂证供
+012 人生有几多个十年，不如活得痛快
+013 嘢可以乱食，话唔可以乱讲
+014 你唔好噉心急入市先喇，淡淡定，有钱剩，睇定啲先再决定喇
+015 仔，你唔好喺度搞搞震，冇帮衬喇
+016 米话我地人穷就要任人踩，滴水都会成流水浸街
+017 佢晨早啪奶茶，同场追加奶绿，又狂怼西米露，喫啫啫猪脚煲
+018 喂！三点几嚟，饮茶先啦，做咁多都冇用嘅，老细唔锡你嘅嚟
+019 嗱嗱声即刻走去搵嘢做，人必须知道自己嘅用途
+020 人人都揸住枝苏格兰场非工业用国际线路自动溶雪16哇佬风油軚垂直升降镭射彩色洗衣干衣气垫毛笔一枝
\ No newline at end of file
diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
index 644ec250d..d05dfafcf 100644
--- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
@@ -39,18 +39,18 @@ def evaluate(args, speedyspeech_config, pwg_config):
 
     # construct dataset for evaluation
     sentences = []
-    with open(args.text, 'rt') as f:
+    with open(args.text, 'rt', encoding='utf-8') as f:
         for line in f:
             items = line.strip().split()
             utt_id = items[0]
             sentence = "".join(items[1:])
             sentences.append((utt_id, sentence))
 
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
-    with open(args.tones_dict, "r") as f:
+    with open(args.tones_dict, 'rt', encoding='utf-8') as f:
         tone_id = [line.strip().split() for line in f.readlines()]
     tone_size = len(tone_id)
     print("tone_size:", tone_size)
diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py
index 7b422e64f..c90090daa 100644
--- a/paddlespeech/t2s/exps/speedyspeech/train.py
+++ b/paddlespeech/t2s/exps/speedyspeech/train.py
@@ -70,7 +70,7 @@ def train_sp(args, config):
     if args.speaker_dict is not None:
         print("multiple speaker speedyspeech!")
         collate_fn = speedyspeech_multi_spk_batch_fn
-        with open(args.speaker_dict, 'rt') as f:
+        with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
             spk_id = [line.strip().split() for line in f.readlines()]
         spk_num = len(spk_id)
         fields += ["spk_id"]
@@ -133,11 +133,11 @@ def train_sp(args, config):
         collate_fn=collate_fn,
         num_workers=config.num_workers)
     print("dataloaders done!")
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
-    with open(args.tones_dict, "r") as f:
+    with open(args.tones_dict, 'rt', encoding='utf-8') as f:
         tone_id = [line.strip().split() for line in f.readlines()]
     tone_size = len(tone_id)
     print("tone_size:", tone_size)
diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py
index 6b693440c..dd3b4d553 100644
--- a/paddlespeech/t2s/exps/syn_utils.py
+++ b/paddlespeech/t2s/exps/syn_utils.py
@@ -33,6 +33,7 @@ from paddlespeech.t2s.datasets.am_batch_fn import *
 from paddlespeech.t2s.datasets.data_table import DataTable
 from paddlespeech.t2s.datasets.vocoder_batch_fn import Clip_static
 from paddlespeech.t2s.frontend import English
+from paddlespeech.t2s.frontend.canton_frontend import CantonFrontend
 from paddlespeech.t2s.frontend.mix_frontend import MixFrontend
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
 from paddlespeech.t2s.modules.normalizer import ZScore
@@ -106,12 +107,12 @@ def get_chunks(data, block_size: int, pad_size: int):
 def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'):
     # construct dataset for evaluation
     sentences = []
-    with open(text_file, 'rt') as f:
+    with open(text_file, 'rt', encoding='utf-8') as f:
         for line in f:
             if line.strip() != "":
                 items = re.split(r"\s+", line.strip(), 1)
                 utt_id = items[0]
-                if lang == 'zh':
+                if lang in {'zh', 'canton'}:
                     sentence = "".join(items[1:])
                 elif lang == 'en':
                     sentence = " ".join(items[1:])
@@ -132,8 +133,8 @@ def get_test_dataset(test_metadata: List[Dict[str, Any]],
     converters = {}
     if am_name == 'fastspeech2':
         fields = ["utt_id", "text"]
-        if am_dataset in {"aishell3", "vctk",
-                          "mix"} and speaker_dict is not None:
+        if am_dataset in {"aishell3", "vctk", "mix",
+                          "canton"} and speaker_dict is not None:
             print("multiple speaker fastspeech2!")
             fields += ["spk_id"]
         elif voice_cloning:
@@ -177,8 +178,8 @@ def get_dev_dataloader(dev_metadata: List[Dict[str, Any]],
     converters = {}
     if am_name == 'fastspeech2':
         fields = ["utt_id", "text"]
-        if am_dataset in {"aishell3", "vctk",
-                          "mix"} and speaker_dict is not None:
+        if am_dataset in {"aishell3", "vctk", "mix",
+                          "canton"} and speaker_dict is not None:
             print("multiple speaker fastspeech2!")
             collate_fn = fastspeech2_multi_spk_batch_fn_static
             fields += ["spk_id"]
@@ -266,6 +267,8 @@ def get_frontend(lang: str='zh',
             phone_vocab_path=phones_dict,
             tone_vocab_path=tones_dict,
             use_rhy=use_rhy)
+    elif lang == 'canton':
+        frontend = CantonFrontend(phone_vocab_path=phones_dict)
     elif lang == 'en':
         frontend = English(phone_vocab_path=phones_dict)
     elif lang == 'mix':
@@ -302,6 +305,10 @@ def run_frontend(frontend: object,
         if get_tone_ids:
             tone_ids = input_ids["tone_ids"]
             outs.update({'tone_ids': tone_ids})
+    elif lang == 'canton':
+        input_ids = frontend.get_input_ids(
+            text, merge_sentences=merge_sentences, to_tensor=to_tensor)
+        phone_ids = input_ids["phone_ids"]
     elif lang == 'en':
         input_ids = frontend.get_input_ids(
             text, merge_sentences=merge_sentences, to_tensor=to_tensor)
@@ -311,7 +318,7 @@ def run_frontend(frontend: object,
             text, merge_sentences=merge_sentences, to_tensor=to_tensor)
         phone_ids = input_ids["phone_ids"]
     else:
-        print("lang should in {'zh', 'en', 'mix'}!")
+        print("lang should in {'zh', 'en', 'mix', 'canton'}!")
     outs.update({'phone_ids': phone_ids})
     return outs
 
@@ -325,17 +332,17 @@ def get_am_inference(am: str='fastspeech2_csmsc',
                      tones_dict: Optional[os.PathLike]=None,
                      speaker_dict: Optional[os.PathLike]=None,
                      return_am: bool=False):
-    with open(phones_dict, "r") as f:
+    with open(phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     tone_size = None
     if tones_dict is not None:
-        with open(tones_dict, "r") as f:
+        with open(tones_dict, 'rt', encoding='utf-8') as f:
             tone_id = [line.strip().split() for line in f.readlines()]
         tone_size = len(tone_id)
     spk_num = None
     if speaker_dict is not None:
-        with open(speaker_dict, 'rt') as f:
+        with open(speaker_dict, 'rt', encoding='utf-8') as f:
             spk_id = [line.strip().split() for line in f.readlines()]
         spk_num = len(spk_id)
     odim = am_config.n_mels
@@ -411,8 +418,8 @@ def am_to_static(am_inference,
     am_name = am[:am.rindex('_')]
     am_dataset = am[am.rindex('_') + 1:]
     if am_name == 'fastspeech2':
-        if am_dataset in {"aishell3", "vctk",
-                          "mix"} and speaker_dict is not None:
+        if am_dataset in {"aishell3", "vctk", "mix",
+                          "canton"} and speaker_dict is not None:
             am_inference = jit.to_static(
                 am_inference,
                 input_spec=[
@@ -424,8 +431,8 @@ def am_to_static(am_inference,
                 am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)])
 
     elif am_name == 'speedyspeech':
-        if am_dataset in {"aishell3", "vctk",
-                          "mix"} and speaker_dict is not None:
+        if am_dataset in {"aishell3", "vctk", "mix",
+                          "canton"} and speaker_dict is not None:
             am_inference = jit.to_static(
                 am_inference,
                 input_spec=[
@@ -575,7 +582,7 @@ def get_am_output(
     get_tone_ids = False
     if am_name == 'speedyspeech':
         get_tone_ids = True
-    if am_dataset in {"aishell3", "vctk", "mix"} and speaker_dict:
+    if am_dataset in {"aishell3", "vctk", "mix", "canton"} and speaker_dict:
         get_spk_id = True
         spk_id = np.array([spk_id])
 
diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py
index a8e18150e..70e52244f 100644
--- a/paddlespeech/t2s/exps/synthesize.py
+++ b/paddlespeech/t2s/exps/synthesize.py
@@ -136,7 +136,8 @@ def parse_args():
         choices=[
             'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_ljspeech',
             'fastspeech2_aishell3', 'fastspeech2_vctk', 'tacotron2_csmsc',
-            'tacotron2_ljspeech', 'tacotron2_aishell3', 'fastspeech2_mix'
+            'tacotron2_ljspeech', 'tacotron2_aishell3', 'fastspeech2_mix',
+            'fastspeech2_canton'
         ],
         help='Choose acoustic model type of tts task.')
     parser.add_argument(
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py
index 625002477..3b87d9e16 100644
--- a/paddlespeech/t2s/exps/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/synthesize_e2e.py
@@ -119,7 +119,7 @@ def evaluate(args):
                     # acoustic model
                     if am_name == 'fastspeech2':
                         # multi speaker
-                        if am_dataset in {"aishell3", "vctk", "mix"}:
+                        if am_dataset in {"aishell3", "vctk", "mix", "canton"}:
                             spk_id = paddle.to_tensor(args.spk_id)
                             mel = am_inference(part_phone_ids, spk_id)
                         else:
@@ -167,7 +167,8 @@ def parse_args():
         choices=[
             'speedyspeech_csmsc', 'speedyspeech_aishell3', 'fastspeech2_csmsc',
             'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk',
-            'tacotron2_csmsc', 'tacotron2_ljspeech', 'fastspeech2_mix'
+            'tacotron2_csmsc', 'tacotron2_ljspeech', 'fastspeech2_mix',
+            'fastspeech2_canton'
         ],
         help='Choose acoustic model type of tts task.')
     parser.add_argument(
diff --git a/paddlespeech/t2s/exps/tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py
index 69ff80e46..db88009a8 100644
--- a/paddlespeech/t2s/exps/tacotron2/train.py
+++ b/paddlespeech/t2s/exps/tacotron2/train.py
@@ -119,7 +119,7 @@ def train_sp(args, config):
         num_workers=config.num_workers)
     print("dataloaders done!")
 
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py
index da48b6b99..d49baad99 100644
--- a/paddlespeech/t2s/exps/transformer_tts/train.py
+++ b/paddlespeech/t2s/exps/transformer_tts/train.py
@@ -114,7 +114,7 @@ def train_sp(args, config):
         num_workers=config.num_workers)
     print("dataloaders done!")
 
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/exps/vits/normalize.py b/paddlespeech/t2s/exps/vits/normalize.py
index 514cbef8e..24e15765e 100644
--- a/paddlespeech/t2s/exps/vits/normalize.py
+++ b/paddlespeech/t2s/exps/vits/normalize.py
@@ -187,7 +187,7 @@ def main():
             record["spk_emb"] = str(item["spk_emb"])
 
         output_metadata.append(record)
-    output_metadata.sort(key=itemgetter('feats_lengths'))
+    output_metadata.sort(key=itemgetter('feats_lengths'), reverse=True)
     output_metadata_path = Path(args.dumpdir) / "metadata.jsonl"
     with jsonlines.open(output_metadata_path, 'w') as writer:
         for item in output_metadata:
diff --git a/paddlespeech/t2s/exps/vits/preprocess.py b/paddlespeech/t2s/exps/vits/preprocess.py
index 2b1a40834..d6b226a20 100644
--- a/paddlespeech/t2s/exps/vits/preprocess.py
+++ b/paddlespeech/t2s/exps/vits/preprocess.py
@@ -166,7 +166,7 @@ def process_sentences(config,
                     if record:
                         results.append(record)
 
-    results.sort(key=itemgetter("feats_lengths"))
+    results.sort(key=itemgetter("feats_lengths"), reverse=True)
     with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer:
         for item in results:
             writer.write(item)
diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py
index 07301db56..0e74bf631 100644
--- a/paddlespeech/t2s/exps/vits/train.py
+++ b/paddlespeech/t2s/exps/vits/train.py
@@ -78,7 +78,7 @@ def train_sp(args, config):
     if args.speaker_dict is not None:
         print("multiple speaker vits!")
         collate_fn = vits_multi_spk_batch_fn
-        with open(args.speaker_dict, 'rt') as f:
+        with open(args.speaker_dict, 'rt', encoding='utf-8') as f:
             spk_id = [line.strip().split() for line in f.readlines()]
         spk_num = len(spk_id)
         fields += ["spk_id"]
@@ -110,7 +110,7 @@ def train_sp(args, config):
     train_sampler = ErnieSATSampler(
         train_dataset,
         batch_size=config.batch_size,
-        shuffle=True,
+        shuffle=False,
         drop_last=True)
     dev_sampler = ErnieSATSampler(
         dev_dataset,
@@ -132,7 +132,7 @@ def train_sp(args, config):
         num_workers=config.num_workers)
     print("dataloaders done!")
 
-    with open(args.phones_dict, "r") as f:
+    with open(args.phones_dict, 'rt', encoding='utf-8') as f:
         phn_id = [line.strip().split() for line in f.readlines()]
     vocab_size = len(phn_id)
     print("vocab_size:", vocab_size)
diff --git a/paddlespeech/t2s/frontend/canton_frontend.py b/paddlespeech/t2s/frontend/canton_frontend.py
new file mode 100644
index 000000000..f81526839
--- /dev/null
+++ b/paddlespeech/t2s/frontend/canton_frontend.py
@@ -0,0 +1,106 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict
+from typing import List
+
+import numpy as np
+import paddle
+import ToJyutping
+
+from paddlespeech.t2s.frontend.zh_normalization.text_normlization import TextNormalizer
+
+# Prefixes that get_lines() splits off the front of each syllable token.
+INITIALS = [
+    'p', 'b', 't', 'd', 'ts', 'dz', 'k', 'g', 'kw', 'gw', 'f', 'h', 'l', 'm',
+    'ng', 'n', 's', 'y', 'w', 'c', 'z', 'j', 'sp', 'spl', 'spn', 'sil'
+]
+
+
+def get_lines(cantons: List[str]):
+    """Split Jyutping syllables into initial/final phones, flattened."""
+    phones = []
+    for canton in cantons:
+        # longest initial only: 'ngo5' -> 'ng', 'o5' (not also 'n', 'go5')
+        onset = max(filter(canton.startswith, INITIALS), key=len, default='')
+        phones += [onset, canton[len(onset):]] if onset else []  # else dropped
+    return phones
+
+
+class CantonFrontend():
+    """Cantonese frontend: normalized text -> Jyutping phones -> phone ids."""
+    def __init__(self, phone_vocab_path: str):
+        self.text_normalizer = TextNormalizer()
+        self.punc = "：，；。？！“”‘’':,;.?!"
+        # phone -> id, parsed from "<phone> <id>" lines; empty without a path
+        self.vocab_phones = {}
+        if phone_vocab_path:
+            with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
+                phn_id = [line.strip().split() for line in f.readlines()]
+            for phn, id in phn_id:
+                self.vocab_phones[phn] = int(id)
+
+    def _g2p(self, sentences: List[str],
+             merge_sentences: bool=True) -> List[List[str]]:
+        """One phone list per sentence; a single joint list if merge_sentences."""
+        phones_list = []
+        for sentence in sentences:
+            phones_str = ToJyutping.get_jyutping_text(sentence)
+            phones_list.append(get_lines(phones_str.split(' ')))
+        if merge_sentences:  # previously ignored: join all sentences into one
+            phones_list = [[phn for phones in phones_list for phn in phones]]
+        return phones_list
+
+    def _p2id(self, phonemes: List[str]) -> np.ndarray:
+        """Look up vocabulary ids; phones missing from the vocab map to "sp"."""
+        vocab = self.vocab_phones
+        phone_ids = [vocab[phn if phn in vocab else "sp"] for phn in phonemes]
+        return np.array(phone_ids, np.int64)
+
+    def get_phonemes(self,
+                     sentence: str,
+                     merge_sentences: bool=True,
+                     print_info: bool=False) -> List[List[str]]:
+        """Normalize text, then run G2P; print_info dumps both stages."""
+        sentences = self.text_normalizer.normalize(sentence)
+        phonemes = self._g2p(sentences, merge_sentences=merge_sentences)
+        if print_info:
+            print("----------------------------")
+            print("text norm results:")
+            print(sentences)
+            print("----------------------------")
+            print("g2p results:")
+            print(phonemes)
+            print("----------------------------")
+        return phonemes
+
+    def get_input_ids(self,
+                      sentence: str,
+                      merge_sentences: bool=True,
+                      print_info: bool=False,
+                      to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
+        """Return {"phone_ids": [...]}; the key is absent if no phones result."""
+        phonemes = self.get_phonemes(
+            sentence, merge_sentences=merge_sentences, print_info=print_info)
+        result = {}
+        temp_phone_ids = []
+        for phones in phonemes:
+            if phones:
+                phone_ids = self._p2id(phones)
+                # original note: paddle.to_tensor() under onnxruntime makes the first call slow
+                if to_tensor:
+                    phone_ids = paddle.to_tensor(phone_ids)
+                temp_phone_ids.append(phone_ids)
+        if temp_phone_ids:
+            result["phone_ids"] = temp_phone_ids
+        return result
diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py
index 261db80a8..af86d9b80 100644
--- a/paddlespeech/t2s/frontend/phonectic.py
+++ b/paddlespeech/t2s/frontend/phonectic.py
@@ -58,7 +58,7 @@ class English(Phonetics):
         self.punc = "：，；。？！“”‘’':,;.?!"
         self.text_normalizer = TextNormalizer()
         if phone_vocab_path:
-            with open(phone_vocab_path, 'rt') as f:
+            with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
                 phn_id = [line.strip().split() for line in f.readlines()]
             for phn, id in phn_id:
                 self.vocab_phones[phn] = int(id)
diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py
index ddd8cf5c7..35b97a93a 100644
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
@@ -138,18 +138,18 @@ class Frontend():
             "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿",
             "流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿",
             "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿",
-            "狗儿"
+            "狗儿", "少儿"
         }
 
         self.vocab_phones = {}
         self.vocab_tones = {}
         if phone_vocab_path:
-            with open(phone_vocab_path, 'rt') as f:
+            with open(phone_vocab_path, 'rt', encoding='utf-8') as f:
                 phn_id = [line.strip().split() for line in f.readlines()]
             for phn, id in phn_id:
                 self.vocab_phones[phn] = int(id)
         if tone_vocab_path:
-            with open(tone_vocab_path, 'rt') as f:
+            with open(tone_vocab_path, 'rt', encoding='utf-8') as f:
                 tone_id = [line.strip().split() for line in f.readlines()]
             for tone, id in tone_id:
                 self.vocab_tones[tone] = int(id)
diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py
index 09e6827d0..1db9248ae 100644
--- a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py
+++ b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py
@@ -113,16 +113,18 @@ class Tacotron2Updater(StandardUpdater):
         loss.backward()
         optimizer.step()
 
+        if self.use_guided_attn_loss:
+            report("train/attn_loss", float(attn_loss))
+            losses_dict["attn_loss"] = float(attn_loss)
+        
         report("train/l1_loss", float(l1_loss))
         report("train/mse_loss", float(mse_loss))
         report("train/bce_loss", float(bce_loss))
-        report("train/attn_loss", float(attn_loss))
         report("train/loss", float(loss))
 
         losses_dict["l1_loss"] = float(l1_loss)
         losses_dict["mse_loss"] = float(mse_loss)
         losses_dict["bce_loss"] = float(bce_loss)
-        losses_dict["attn_loss"] = float(attn_loss)
         losses_dict["loss"] = float(loss)
         self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                               for k, v in losses_dict.items())
@@ -202,17 +204,19 @@ class Tacotron2Evaluator(StandardEvaluator):
             attn_loss = self.attn_loss(
                 att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in)
             loss = loss + attn_loss
+        
+        if self.use_guided_attn_loss:
+            report("eval/attn_loss", float(attn_loss))
+            losses_dict["attn_loss"] = float(attn_loss)
 
         report("eval/l1_loss", float(l1_loss))
         report("eval/mse_loss", float(mse_loss))
         report("eval/bce_loss", float(bce_loss))
-        report("eval/attn_loss", float(attn_loss))
         report("eval/loss", float(loss))
 
         losses_dict["l1_loss"] = float(l1_loss)
         losses_dict["mse_loss"] = float(mse_loss)
         losses_dict["bce_loss"] = float(bce_loss)
-        losses_dict["attn_loss"] = float(attn_loss)
         losses_dict["loss"] = float(loss)
         self.msg += ', '.join('{}: {:>.6f}'.format(k, v)
                               for k, v in losses_dict.items())
diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py
index 52fe84ceb..be684ce38 100644
--- a/paddlespeech/t2s/modules/diffusion.py
+++ b/paddlespeech/t2s/modules/diffusion.py
@@ -40,7 +40,7 @@ class WaveNetDenoiser(nn.Layer):
         layers (int, optional): 
             Number of residual blocks inside, by default 20
         stacks (int, optional):
-            The number of groups to split the residual blocks into, by default 4
+            The number of groups to split the residual blocks into, by default 5
             Within each group, the dilation of the residual block grows exponentially.
         residual_channels (int, optional): 
             Residual channel of the residual blocks, by default 256
@@ -64,7 +64,7 @@ class WaveNetDenoiser(nn.Layer):
             out_channels: int=80,
             kernel_size: int=3,
             layers: int=20,
-            stacks: int=4,
+            stacks: int=5,
             residual_channels: int=256,
             gate_channels: int=512,
             skip_channels: int=256,
@@ -72,7 +72,7 @@ class WaveNetDenoiser(nn.Layer):
             dropout: float=0.,
             bias: bool=True,
             use_weight_norm: bool=False,
-            init_type: str="kaiming_uniform", ):
+            init_type: str="kaiming_normal", ):
         super().__init__()
 
         # initialize parameters
@@ -118,18 +118,15 @@ class WaveNetDenoiser(nn.Layer):
                 bias=bias)
             self.conv_layers.append(conv)
 
+        final_conv = nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True)
+        nn.initializer.Constant(0.0)(final_conv.weight)
         self.last_conv_layers = nn.Sequential(nn.ReLU(),
                                               nn.Conv1D(
                                                   skip_channels,
                                                   skip_channels,
                                                   1,
                                                   bias_attr=True),
-                                              nn.ReLU(),
-                                              nn.Conv1D(
-                                                  skip_channels,
-                                                  out_channels,
-                                                  1,
-                                                  bias_attr=True))
+                                              nn.ReLU(), final_conv)
 
         if use_weight_norm:
             self.apply_weight_norm()
@@ -200,10 +197,6 @@ class GaussianDiffusion(nn.Layer):
     Args:
         denoiser (Layer, optional): 
             The model used for denoising noises.
-            In fact, the denoiser model performs the operation 
-            of producing a output with more noises from the noisy input. 
-            Then we use the diffusion algorithm to calculate 
-            the input with the output to get the denoised result.
         num_train_timesteps (int, optional): 
             The number of timesteps between the noise and the real during training, by default 1000.
         beta_start (float, optional): 
@@ -233,7 +226,8 @@ class GaussianDiffusion(nn.Layer):
         >>>     def callback(index, timestep, num_timesteps, sample):
         >>>         nonlocal pbar
         >>>         if pbar is None:
-        >>>             pbar = tqdm(total=num_timesteps-index)
+        >>>             pbar = tqdm(total=num_timesteps)
+        >>>             pbar.update(index)
         >>>         pbar.update()
         >>> 
         >>>     return callback
@@ -247,7 +241,7 @@ class GaussianDiffusion(nn.Layer):
         >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step)
         >>> with paddle.no_grad():
         >>>     sample = diffusion.inference(
-        >>>         paddle.randn(x.shape), c, x, 
+        >>>         paddle.randn(x.shape), c, ref_x=x_in, 
         >>>         num_inference_steps=infer_steps,
         >>>         scheduler_type=scheduler_type,
         >>>         callback=create_progress_callback())
@@ -262,7 +256,7 @@ class GaussianDiffusion(nn.Layer):
         >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step)
         >>> with paddle.no_grad():
         >>>     sample = diffusion.inference(
-        >>>         paddle.randn(x.shape), c, x_in, 
+        >>>         paddle.randn(x.shape), c, ref_x=x_in, 
         >>>         num_inference_steps=infer_steps,
         >>>         scheduler_type=scheduler_type,
         >>>         callback=create_progress_callback())
@@ -277,11 +271,11 @@ class GaussianDiffusion(nn.Layer):
         >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step)
         >>> with paddle.no_grad():
         >>>     sample = diffusion.inference(
-        >>>         paddle.randn(x.shape), c, None, 
+        >>>         paddle.randn(x.shape), c, ref_x=x_in, 
         >>>         num_inference_steps=infer_steps,
         >>>         scheduler_type=scheduler_type,
         >>>         callback=create_progress_callback())
-        100%|█████| 25/25 [00:01<00:00, 19.75it/s]
+        100%|█████| 34/34 [00:01<00:00, 19.75it/s]
         >>> 
         >>> # ds=1000, K_step=100, scheduler=pndm, infer_step=50, from aux fs2 mel output
         >>> ds = 1000
@@ -292,11 +286,11 @@ class GaussianDiffusion(nn.Layer):
         >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step)
         >>> with paddle.no_grad():
         >>>     sample = diffusion.inference(
-        >>>         paddle.randn(x.shape), c, x, 
+        >>>         paddle.randn(x.shape), c, ref_x=x_in, 
         >>>         num_inference_steps=infer_steps,
         >>>         scheduler_type=scheduler_type,
         >>>         callback=create_progress_callback())
-        100%|█████| 5/5 [00:00<00:00, 23.80it/s]
+        100%|█████| 14/14 [00:00<00:00, 23.80it/s]
 
     """
 
@@ -366,6 +360,8 @@ class GaussianDiffusion(nn.Layer):
                   num_inference_steps: Optional[int]=1000,
                   strength: Optional[float]=None,
                   scheduler_type: Optional[str]="ddpm",
+                  clip_noise: Optional[bool]=True,
+                  clip_noise_range: Optional[Tuple[float, float]]=(-1, 1),
                   callback: Optional[Callable[[int, int, int, paddle.Tensor],
                                               None]]=None,
                   callback_steps: Optional[int]=1):
@@ -386,6 +382,10 @@ class GaussianDiffusion(nn.Layer):
             scheduler_type (str, optional):
                 Noise scheduler for generate noises. 
                 Choose a great scheduler can skip many denoising step, by default 'ddpm'.
+            clip_noise (bool, optional):
+                Whether to clip the initial noisy input and each denoised output, by default True.
+            clip_noise_range (tuple, optional):
+                The (min, max) value range that outputs are clipped to when clip_noise is True, by default (-1, 1).
             callback (Callable[[int,int,int,Tensor], None], optional):
                 Callback function during denoising steps.
 
@@ -446,6 +446,9 @@ class GaussianDiffusion(nn.Layer):
 
         # denoising loop
         denoised_output = noisy_input
+        if clip_noise:
+            n_min, n_max = clip_noise_range
+            denoised_output = paddle.clip(denoised_output, n_min, n_max)
         num_warmup_steps = len(
             timesteps) - num_inference_steps * scheduler.order
         for i, t in enumerate(timesteps):
@@ -457,6 +460,8 @@ class GaussianDiffusion(nn.Layer):
             # compute the previous noisy sample x_t -> x_t-1
             denoised_output = scheduler.step(noise_pred, t,
                                              denoised_output).prev_sample
+            if clip_noise:
+                denoised_output = paddle.clip(denoised_output, n_min, n_max)
 
             # call the callback, if provided
             if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and
diff --git a/setup.py b/setup.py
index 212d3b109..69739b3b8 100644
--- a/setup.py
+++ b/setup.py
@@ -37,9 +37,7 @@ base = [
     "g2pM",
     "h5py",
     "inflect",
-    "jieba",
     "jsonlines",
-    "kaldiio",
     "librosa==0.8.1",
     "loguru",
     "matplotlib",
@@ -51,37 +49,29 @@ base = [
     "paddlenlp>=2.4.8",
     "ppdiffusers>=0.9.0",
     "paddlespeech_feat",
-    "Pillow>=9.0.0",
-    "praatio==5.0.0",
-    "protobuf>=3.1.0, <=3.20.0",
+    "praatio>=5.0.0",
     "pypinyin<=0.44.0",
     "pypinyin-dict",
     "python-dateutil",
-    "pyworld==0.2.12",
-    "resampy==0.2.2",
+    "pyworld>=0.2.12",
+    "resampy",
     "sacrebleu",
-    "scipy",
-    "sentencepiece~=0.1.96",
-    "soundfile~=0.10",
     "textgrid",
     "timer",
-    "tqdm",
+    "ToJyutping",
     "typeguard",
-    "visualdl",
     "webrtcvad",
     "yacs~=0.1.8",
     "prettytable",
     "zhon",
-    "colorlog",
-    "pathos==0.2.8",
     "braceexpand",
     "pyyaml",
-    "pybind11",
-    "paddleslim==2.3.4",
-    "paddleaudio>=1.0.2",
+    "paddleslim>=2.3.4",
+    "paddleaudio>=1.1.0",
+    "hyperpyyaml",
 ]
 
-server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"]
+server = ["pattern_singleton", "websockets"]
 
 requirements = {
     "install":
@@ -304,7 +294,8 @@ setup_info = dict(
     },
 
     # Package info
-    packages=find_packages(include=('paddlespeech*')),
+    packages=find_packages(
+        include=['paddlespeech*'], exclude=['utils', 'third_party']),
     zip_safe=True,
     classifiers=[
         'Development Status :: 5 - Production/Stable',
diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh
index cb05a1d0f..9ff81bd8b 100755
--- a/tests/test_tipc/prepare.sh
+++ b/tests/test_tipc/prepare.sh
@@ -73,6 +73,9 @@ if [[ ${MODE} = "benchmark_train" ]];then
         mkdir -p BZNSYP
         unrar x BZNSYP.rar BZNSYP
         wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/benchmark/durations.txt
+        # Pre-fetch nltk_data so a failed download caused by network problems cannot make the program hang
+        wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz
+        tar -xzf nltk_data.tar.gz -C ${HOME}
         # 数据预处理
         python ../paddlespeech/t2s/exps/gan_vocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=../examples/csmsc/voc1/conf/default.yaml
         python ../utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats"
diff --git a/third_party/ctc_decoders/setup.py b/third_party/ctc_decoders/setup.py
index c13f3df99..5ae5b3bf6 100644
--- a/third_party/ctc_decoders/setup.py
+++ b/third_party/ctc_decoders/setup.py
@@ -129,7 +129,7 @@ decoders_module = [
 
 setup(
     name='paddlespeech_ctcdecoders',
-    version='0.2.0',
+    version='0.2.2',
     description="CTC decoders in paddlespeech",
     author="PaddlePaddle Speech and Language Team",
     author_email="paddlesl@baidu.com",