diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..33d53d9f --- /dev/null +++ b/.github/CODE_OF_CONDUCT.md @@ -0,0 +1,77 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Racial or political allusions +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at paddlespeech@baidu.com. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. 
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
new file mode 100644
index 00000000..1ff47330
--- /dev/null
+++ b/.github/CONTRIBUTING.md
@@ -0,0 +1,30 @@
+# 💡 paddlespeech code contribution guidelines
+
+### Discussed in https://github.com/PaddlePaddle/PaddleSpeech/discussions/1326
+
+
+Originally posted by **yt605155624** January 12, 2022
+1. After you finish your changes, run our pre-commit hooks to check the code style. Only keep the formatting fixes in the files you actually modified; pre-commit may reformat other files as well, and you can simply avoid `git add`-ing those.
+```
+pip install pre-commit
+pre-commit run --files <the files you modified>
+```
+2. Add the appropriate tag to your commit message to skip the CI jobs that are irrelevant to your change:
+- for ASR-related code
+```text
+git commit -m "xxxxxx, test=asr"
+```
+- for TTS-related code
+```text
+git commit -m "xxxxxx, test=tts"
+```
+- for documentation-only changes
+```text
+git commit -m "xxxxxx, test=doc"
+```
+Notes:
+1. Even though the tagged jobs are skipped, your build still has to reach the front of the CI queue before it can be skipped, so do not worry when jobs outside your area show as pending 🤣
+2. Adding `test=xxx` only at `git commit --amend` time may not take effect.
+3. If a PR contains multiple commits, add `test=xxx` to every commit message, because each commit triggers CI (a local check for this is sketched below).
+4. Remove any paddlespeech already installed in your Python environment; otherwise it may affect the import order of `import paddlespeech`.
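Since each commit needs its own `test=xxx` tag, it can help to verify the tag locally before pushing. The following is only an illustrative sketch: the script name and the tag list are assumptions, not something shipped with the repository.

```python
# check_commit_tag.py -- hypothetical helper, not part of PaddleSpeech itself.
# Verifies that the HEAD commit message carries one of the CI-skip tags
# described above (test=asr / test=tts / test=doc).
import re
import subprocess

VALID_TAGS = {"asr", "tts", "doc"}


def head_commit_message() -> str:
    # "%B" prints the raw message of the most recent commit.
    return subprocess.check_output(
        ["git", "log", "-1", "--pretty=%B"], text=True)


if __name__ == "__main__":
    message = head_commit_message()
    match = re.search(r"test=(\w+)", message)
    if match is None or match.group(1) not in VALID_TAGS:
        raise SystemExit(
            "HEAD commit lacks a 'test=asr|tts|doc' tag; CI will not be skipped.")
    print("CI-skip tag found:", match.group(0))
```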
diff --git a/.github/ISSUE_TEMPLATE/bug-report-tts.md b/.github/ISSUE_TEMPLATE/bug-report-tts.md index 64b33c32..e2322c23 100644 --- a/.github/ISSUE_TEMPLATE/bug-report-tts.md +++ b/.github/ISSUE_TEMPLATE/bug-report-tts.md @@ -3,7 +3,6 @@ name: "\U0001F41B TTS Bug Report" about: Create a report to help us improve title: "[TTS]XXXX" labels: Bug, T2S -assignees: yt605155624 --- diff --git a/.github/stale.yml b/.github/stale.yml index da19b660..6b0da9b9 100644 --- a/.github/stale.yml +++ b/.github/stale.yml @@ -6,7 +6,8 @@ daysUntilClose: 30 exemptLabels: - Roadmap - Bug - - New Feature + - feature request + - Tips # Label to use when marking an issue as stale staleLabel: Stale # Comment to post when marking an issue as stale. Set to `false` to disable @@ -17,4 +18,4 @@ markComment: > unmarkComment: false # Comment to post when closing a stale issue. Set to `false` to disable closeComment: > - This issue is closed. Please re-open if needed. \ No newline at end of file + This issue is closed. Please re-open if needed. diff --git a/.gitignore b/.gitignore index 75f56b60..4a0c4331 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ *.egg-info build *output/ +.history audio/dist/ audio/fc_patch/ diff --git a/.pre-commit-hooks/copyright-check.hook b/.pre-commit-hooks/copyright-check.hook index 761edbc0..5a409e06 100644 --- a/.pre-commit-hooks/copyright-check.hook +++ b/.pre-commit-hooks/copyright-check.hook @@ -19,7 +19,7 @@ import subprocess import platform COPYRIGHT = ''' -Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -128,4 +128,4 @@ def main(argv=None): if __name__ == '__main__': - exit(main()) \ No newline at end of file + exit(main()) diff --git a/README.md b/README.md index 0a12ec04..9ed82311 100644 --- a/README.md +++ b/README.md @@ -97,26 +97,47 @@ - Life was like a box of chocolates, you never know what you're gonna get. + Life was like a box of chocolates, you never know what you're gonna get.
- 早上好,今天是2020/10/29,最低温度是-3°C。 + 早上好,今天是2020/10/29,最低温度是-3°C。
- 季姬寂,集鸡,鸡即棘鸡。棘鸡饥叽,季姬及箕稷济鸡。鸡既济,跻姬笈,季姬忌,急咭鸡,鸡急,继圾几,季姬急,即籍箕击鸡,箕疾击几伎,伎即齑,鸡叽集几基,季姬急极屐击鸡,鸡既殛,季姬激,即记《季姬击鸡记》。 + 季姬寂,集鸡,鸡即棘鸡。棘鸡饥叽,季姬及箕稷济鸡。鸡既济,跻姬笈,季姬忌,急咭鸡,鸡急,继圾几,季姬急,即籍箕击鸡,箕疾击几伎,伎即齑,鸡叽集几基,季姬急极屐击鸡,鸡既殛,季姬激,即记《季姬击鸡记》。
+ + 大家好,我是 parrot 虚拟老师,我们来读一首诗,我与春风皆过客,I and the spring breeze are passing by,你携秋水揽星河,you take the autumn water to take the galaxy。 + + +
+ + + + 宜家唔系事必要你讲,但系你所讲嘅说话将会变成呈堂证供。 + + +
+ + + + 各个国家有各个国家嘅国歌 + + +
+
+
@@ -157,16 +178,24 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
- 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV).
### Recent Update
-- 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model).
-- 🎉 2022.11.30: Add [TTS Android Demo](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid).
+- 🔥 2023.04.06: Add [subtitle file (.srt format) generation example](./demos/streaming_asr_server).
+- 🔥 2023.03.14: Add SVS (Singing Voice Synthesis) examples with the Opencpop dataset, including [DiffSinger](./examples/opencpop/svs1), [PWGAN](./examples/opencpop/voc1) and [HiFiGAN](./examples/opencpop/voc5); the quality is being continuously improved.
+- 👑 2023.03.09: Add [Wav2vec2ASR-zh](./examples/aishell/asr3).
+- 🎉 2023.03.07: Add [TTS ARM Linux C++ Demo (with C++ Chinese Text Frontend)](./demos/TTSArmLinux).
+- 🔥 2023.03.03: Add Voice Conversion [StarGANv2-VC synthesize pipeline](./examples/vctk/vc3).
+- 🎉 2023.02.16: Add [Cantonese TTS](./examples/canton/tts3).
+- 🔥 2023.01.10: Add [code-switched ASR CLI and Demos](./demos/speech_recognition).
+- 👑 2023.01.06: Add [code-switched ASR tal_cs recipe](./examples/tal_cs/asr1/).
+- 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](./examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model).
+- 🎉 2022.11.30: Add [TTS Android Demo](./demos/TTSAndroid).
- 🤗 2022.11.28: PP-TTS and PP-ASR demos are available in [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) and [official website of paddlepaddle](https://www.paddlepaddle.org.cn/models).
- 👑 2022.11.18: Add [Whisper CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/pull/2640), supporting multi-language recognition and translation.
-- 🔥 2022.11.18: Add [Wav2vec2 CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_ssl), Support ASR and Feature Extraction.
+- 🔥 2022.11.18: Add [Wav2vec2 CLI and Demos](./demos/speech_ssl), supporting ASR and Feature Extraction.
- 🎉 2022.11.17: Add [male voice for TTS](https://github.com/PaddlePaddle/PaddleSpeech/pull/2660).
- 🔥 2022.11.07: Add [U2/U2++ C++ High Performance Streaming ASR Deployment](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/runtime/examples/u2pp_ol/wenetspeech).
- 👑 2022.11.01: Add [Adversarial Loss](https://arxiv.org/pdf/1907.04448.pdf) for [Chinese English mixed TTS](./examples/zh_en_tts/tts3).
-- 🔥 2022.10.26: Add [Prosody Prediction](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/rhy) for TTS.
+- 🔥 2022.10.26: Add [Prosody Prediction](./examples/other/rhy) for TTS.
- 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for TTS Chinese Text Frontend.
- 👑 2022.10.11: Add [Wav2vec2ASR-en](./examples/librispeech/asr3), wav2vec2.0 fine-tuning for ASR on LibriSpeech.
- 🔥 2022.09.26: Add Voice Cloning, TTS finetune, and [ERNIE-SAT](https://arxiv.org/abs/2211.03545) in [PaddleSpeech Web Demo](./demos/speech_web).
@@ -180,16 +209,16 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
- 🎉 2022.06.22: All TTS models support ONNX format.
- 🍀 2022.06.17: Add [PaddleSpeech Web Demo](./demos/speech_web).
- 👑 2022.05.13: Release [PP-ASR](./docs/source/asr/PPASR.md), [PP-TTS](./docs/source/tts/PPTTS.md), [PP-VPR](docs/source/vpr/PPVPR.md).
-- 👏🏻 2022.05.06: `PaddleSpeech Streaming Server` is available for `Streaming ASR` with `Punctuation Restoration` and `Token Timestamp` and `Text-to-Speech`.
-- 👏🏻 2022.05.06: `PaddleSpeech Server` is available for `Audio Classification`, `Automatic Speech Recognition` and `Text-to-Speech`, `Speaker Verification` and `Punctuation Restoration`.
-- 👏🏻 2022.03.28: `PaddleSpeech CLI` is available for `Speaker Verification`.
-- 👏🏻 2021.12.10: `PaddleSpeech CLI` is available for `Audio Classification`, `Automatic Speech Recognition`, `Speech Translation (English to Chinese)` and `Text-to-Speech`.
+- 👏🏻 2022.05.06: `PaddleSpeech Streaming Server` is available for `Streaming ASR` with `Punctuation Restoration` and `Token Timestamp`, as well as `Text-to-Speech`.
+- 👏🏻 2022.05.06: `PaddleSpeech Server` is available for `Audio Classification`, `Automatic Speech Recognition`, `Text-to-Speech`, `Speaker Verification` and `Punctuation Restoration`.
+- 👏🏻 2022.03.28: `PaddleSpeech CLI` is available for `Speaker Verification`.
+- 👏🏻 2021.12.10: `PaddleSpeech CLI` is available for `Audio Classification`, `Automatic Speech Recognition`, `Speech Translation (English to Chinese)` and `Text-to-Speech`.
### Community
- Scan the QR code below with your WeChat to join the official technical exchange group, get the bonus (more than 20GB of learning materials, such as papers, code and videos) and the live link of the lessons. We look forward to your participation.
- +
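The features listed in the updates above are also exposed through a Python API that mirrors the `paddlespeech` command line. Below is a minimal sketch of the ASR and TTS entry points; the audio file names are placeholders, and the default pretrained models are downloaded automatically on first use:

```python
# Minimal sketch of the Python API behind the `paddlespeech` CLI.
from paddlespeech.cli.asr.infer import ASRExecutor
from paddlespeech.cli.tts.infer import TTSExecutor

# Speech-to-text: the default Mandarin model expects 16 kHz mono wav input.
asr = ASRExecutor()
print(asr(audio_file="input_16k.wav"))  # "input_16k.wav" is a placeholder

# Text-to-speech: writes the synthesized audio to output.wav.
tts = TTSExecutor()
tts(text="你好,欢迎使用飞桨深度学习框架!", output="output.wav")
```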
## Installation
@@ -550,14 +579,14 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
- Text Frontend -   - - tn / g2p - + Text Frontend +   + + tn / g2p + - Acoustic Model + Acoustic Model Tacotron2 LJSpeech / CSMSC tacotron2-ljspeech / tacotron2-csmsc
@@ -592,6 +621,13 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
ERNIE-SAT-vctk / ERNIE-SAT-aishell3 / ERNIE-SAT-zh_en + + DiffSinger + Opencpop + + DiffSinger-opencpop + + Vocoder WaveFlow LJSpeech
@@ -602,9 +638,9 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
Parallel WaveGAN - LJSpeech / VCTK / CSMSC / AISHELL-3 + LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop - PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3 + PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3 / PWGAN-opencpop
@@ -623,9 +659,9 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
HiFiGAN - LJSpeech / VCTK / CSMSC / AISHELL-3 + LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop - HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3 + HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3 / HiFiGAN-opencpop
@@ -985,10 +1021,16 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P
- Many thanks to [vpegasus](https://github.com/vpegasus)/[xuesebot](https://github.com/vpegasus/xuesebot) for developing a rasa chatbot, which is able to speak and listen thanks to PaddleSpeech.
- Many thanks to [chenkui164](https://github.com/chenkui164)/[FastASR](https://github.com/chenkui164/FastASR) for the C++ inference implementation of PaddleSpeech ASR.
- Many thanks to [heyudage](https://github.com/heyudage)/[VoiceTyping](https://github.com/heyudage/VoiceTyping) for the real-time voice typing tool implementation of PaddleSpeech ASR streaming services.
-
+- Many thanks to [EscaticZheng](https://github.com/EscaticZheng)/[ps3.9wheel-install](https://github.com/EscaticZheng/ps3.9wheel-install) for the Python 3.9 prebuilt wheel for PaddleSpeech installation on Windows without Visual Studio.
+- Many thanks to [chinobing](https://github.com/chinobing)/[FastAPI-PaddleSpeech-Audio-To-Text](https://github.com/chinobing/FastAPI-PaddleSpeech-Audio-To-Text) for converting audio to text based on FastAPI and PaddleSpeech.
+- Many thanks to [MistEO](https://github.com/MistEO)/[Pallas-Bot](https://github.com/MistEO/Pallas-Bot) for the QQ bot based on PaddleSpeech TTS.
Besides, PaddleSpeech depends on a lot of open source repositories. See [references](./docs/source/reference.md) for more information.
## License
PaddleSpeech is provided under the [Apache-2.0 License](./LICENSE).
+
+## Stargazers over time
+
+[![Stargazers over time](https://starchart.cc/PaddlePaddle/PaddleSpeech.svg)](https://starchart.cc/PaddlePaddle/PaddleSpeech)
diff --git a/README_cn.md b/README_cn.md
index 5cc156c9..8b98b61c 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -122,6 +122,27 @@
+ + 大家好,我是 parrot 虚拟老师,我们来读一首诗,我与春风皆过客,I and the spring breeze are passing by,你携秋水揽星河,you take the autumn water to take the galaxy。 + + +
+ + + + 宜家唔系事必要你讲,但系你所讲嘅说话将会变成呈堂证供。 + + +
+ + + + 各个国家有各个国家嘅国歌 + + +
+
+
@@ -161,18 +182,24 @@
- 🔬 主流模型及数据集: 本工具包实现了参与整条语音任务流水线的各个模块,并且采用了主流数据集如 LibriSpeech、LJSpeech、AIShell、CSMSC,详情请见 [模型列表](#model-list)。
- 🧩 级联模型应用: 作为传统语音任务的扩展,我们结合了自然语言处理、计算机视觉等任务,实现更接近实际需求的产业级应用。
-
-
### 近期更新
-- 🎉 2022.12.02: 新增 [端到端韵律预测全流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。
-- 🎉 2022.11.30: 新增 [TTS Android 部署示例](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid)。
+- 👑 2023.04.06: 新增 [srt 格式字幕生成功能](./demos/streaming_asr_server)。
+- 🔥 2023.03.14: 新增基于 Opencpop 数据集的 SVS (歌唱合成) 示例,包含 [DiffSinger](./examples/opencpop/svs1)、[PWGAN](./examples/opencpop/voc1) 和 [HiFiGAN](./examples/opencpop/voc5),效果持续优化中。
+- 👑 2023.03.09: 新增 [Wav2vec2ASR-zh](./examples/aishell/asr3)。
+- 🎉 2023.03.07: 新增 [TTS ARM Linux C++ 部署示例 (包含 C++ 中文文本前端模块)](./demos/TTSArmLinux)。
+- 🔥 2023.03.03: 新增声音转换模型 [StarGANv2-VC 合成流程](./examples/vctk/vc3)。
+- 🎉 2023.02.16: 新增[粤语语音合成](./examples/canton/tts3)。
+- 🔥 2023.01.10: 新增[中英混合 ASR CLI 和 Demos](./demos/speech_recognition)。
+- 👑 2023.01.06: 新增 [ASR 中英混合 tal_cs 训练推理流程](./examples/tal_cs/asr1/)。
+- 🎉 2022.12.02: 新增[端到端韵律预测全流程](./examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。
+- 🎉 2022.11.30: 新增 [TTS Android 部署示例](./demos/TTSAndroid)。
- 🤗 2022.11.28: PP-TTS and PP-ASR 示例可在 [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) 和[飞桨官网](https://www.paddlepaddle.org.cn/models)体验!
- 👑 2022.11.18: 新增 [Whisper CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/pull/2640), 支持多种语言的识别与翻译。
-- 🔥 2022.11.18: 新增 [Wav2vec2 CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/demos/speech_ssl), 支持 ASR 和 特征提取.
+- 🔥 2022.11.18: 新增 [Wav2vec2 CLI 和 Demos](./demos/speech_ssl), 支持 ASR 和特征提取。
- 🎉 2022.11.17: TTS 新增[高质量男性音色](https://github.com/PaddlePaddle/PaddleSpeech/pull/2660)。
-- 🔥 2022.11.07: 新增 [U2/U2++ 高性能流式 ASR C++ 部署](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/speechx/examples/u2pp_ol/wenetspeech)。
+- 🔥 2022.11.07: 新增 [U2/U2++ 高性能流式 ASR C++ 部署](./speechx/examples/u2pp_ol/wenetspeech)。
- 👑 2022.11.01: [中英文混合 TTS](./examples/zh_en_tts/tts3) 新增 [Adversarial Loss](https://arxiv.org/pdf/1907.04448.pdf) 模块。
-- 🔥 2022.10.26: TTS 新增[韵律预测](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/rhy)功能。
+- 🔥 2022.10.26: TTS 新增[韵律预测](./examples/other/rhy)功能。
- 🎉 2022.10.21: TTS 中文文本前端新增 [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) 功能。
- 👑 2022.10.11: 新增 [Wav2vec2ASR-en](./examples/librispeech/asr3), 在 LibriSpeech 上针对 ASR 任务对 wav2vec2.0 的 finetuning。
- 🔥 2022.09.26: 新增 Voice Cloning, TTS finetune 和 [ERNIE-SAT](https://arxiv.org/abs/2211.03545) 到 [PaddleSpeech 网页应用](./demos/speech_web)。
@@ -200,7 +227,7 @@
微信扫描二维码关注公众号,点击“马上报名”填写问卷加入官方交流群,获得更高效的问题答疑,与各行各业开发者充分交流,期待您的加入。
- +
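上文提到的粤语语音合成等能力同样可以通过 Python API 调用。下面是一个最小示意:其中 `fastspeech2_canton`、`pwgan_aishell3` 与 `lang="canton"` 均为根据 `examples/canton/tts3` 推测的假设值,请以实际发布的模型列表为准:

```python
# 粤语 TTS 的最小示意(模型标签与语言代码为假设值,仅供参考)。
from paddlespeech.cli.tts.infer import TTSExecutor

tts = TTSExecutor()
tts(
    text="各个国家有各个国家嘅国歌",
    am="fastspeech2_canton",  # 假设的声学模型标签
    voc="pwgan_aishell3",     # 假设的声码器标签
    lang="canton",            # 假设的语言代码
    output="canton.wav",
)
```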
@@ -551,43 +578,50 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
tn / g2p - - - 声学模型 + + + 声学模型 Tacotron2 LJSpeech / CSMSC tacotron2-ljspeech / tacotron2-csmsc - - + + Transformer TTS LJSpeech transformer-ljspeech - - + + SpeedySpeech CSMSC speedyspeech-csmsc - - + + FastSpeech2 LJSpeech / VCTK / CSMSC / AISHELL-3 / ZH_EN / finetune fastspeech2-ljspeech / fastspeech2-vctk / fastspeech2-csmsc / fastspeech2-aishell3 / fastspeech2-zh_en / fastspeech2-finetune - - + + ERNIE-SAT VCTK / AISHELL-3 / ZH_EN ERNIE-SAT-vctk / ERNIE-SAT-aishell3 / ERNIE-SAT-zh_en - + + + DiffSinger + Opencpop + + DiffSinger-opencpop + + 声码器 WaveFlow LJSpeech
@@ -598,9 +632,9 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
Parallel WaveGAN - LJSpeech / VCTK / CSMSC / AISHELL-3 + LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop - PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3 + PWGAN-ljspeech / PWGAN-vctk / PWGAN-csmsc / PWGAN-aishell3 / PWGAN-opencpop
@@ -619,9 +653,9 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
HiFiGAN - LJSpeech / VCTK / CSMSC / AISHELL-3 + LJSpeech / VCTK / CSMSC / AISHELL-3 / Opencpop - HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3 + HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3 / HiFiGAN-opencpop
@@ -678,6 +712,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
+ **声音分类**
@@ -986,13 +1021,19 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
- 非常感谢 [awmmmm](https://github.com/awmmmm) 提供 fastspeech2 aishell3 conformer 预训练模型。
- 非常感谢 [phecda-xu](https://github.com/phecda-xu)/[PaddleDubbing](https://github.com/phecda-xu/PaddleDubbing) 基于 PaddleSpeech 的 TTS 模型搭建带 GUI 操作界面的配音工具。
- 非常感谢 [jerryuhoo](https://github.com/jerryuhoo)/[VTuberTalk](https://github.com/jerryuhoo/VTuberTalk) 基于 PaddleSpeech 的 TTS GUI 界面和基于 ASR 制作数据集的相关代码。
-
- 非常感谢 [vpegasus](https://github.com/vpegasus)/[xuesebot](https://github.com/vpegasus/xuesebot) 基于 PaddleSpeech 的 ASR 与 TTS 设计的可听、说对话机器人。
- 非常感谢 [chenkui164](https://github.com/chenkui164)/[FastASR](https://github.com/chenkui164/FastASR) 对 PaddleSpeech 的 ASR 进行 C++ 推理实现。
- 非常感谢 [heyudage](https://github.com/heyudage)/[VoiceTyping](https://github.com/heyudage/VoiceTyping) 基于 PaddleSpeech 的 ASR 流式服务实现的实时语音输入法工具。
+- 非常感谢 [EscaticZheng](https://github.com/EscaticZheng)/[ps3.9wheel-install](https://github.com/EscaticZheng/ps3.9wheel-install) 对 PaddleSpeech 在 Windows 下的安装提供了无需 Visual Studio、基于 Python 3.9 的预编译依赖安装包。
+- 非常感谢 [chinobing](https://github.com/chinobing)/[FastAPI-PaddleSpeech-Audio-To-Text](https://github.com/chinobing/FastAPI-PaddleSpeech-Audio-To-Text) 利用 FastAPI 实现 PaddleSpeech 语音转文字,文件上传、分割、转换进度显示、后台更新任务并以 csv 格式输出。
+- 非常感谢 [MistEO](https://github.com/MistEO)/[Pallas-Bot](https://github.com/MistEO/Pallas-Bot) 基于 PaddleSpeech TTS 的 QQ Bot 项目。
此外,PaddleSpeech 依赖于许多开源存储库。有关更多信息,请参阅 [references](./docs/source/reference.md)。
## License
PaddleSpeech 在 [Apache-2.0 许可](./LICENSE) 下提供。
+
+## Stargazers over time
+
+[![Stargazers over time](https://starchart.cc/PaddlePaddle/PaddleSpeech.svg)](https://starchart.cc/PaddlePaddle/PaddleSpeech)
diff --git a/audio/CMakeLists.txt b/audio/CMakeLists.txt
index d9ae63cd..021e2447 100644
--- a/audio/CMakeLists.txt
+++ b/audio/CMakeLists.txt
@@ -41,24 +41,18 @@ option(BUILD_PADDLEAUDIO_PYTHON_EXTENSION "Build Python extension" ON)
# cmake
set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH};${PROJECT_SOURCE_DIR}/cmake;${PROJECT_SOURCE_DIR}/cmake/external")
-if (NOT MSVC)
- find_package(GFortranLibs REQUIRED)
- include(FortranCInterface)
- include(FindGFortranLibs REQUIRED)
-endif()
-
# fc_patch dir
set(FETCHCONTENT_QUIET off)
get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}")
set(FETCHCONTENT_BASE_DIR ${fc_patch})
set(THIRD_PARTY_PATH ${fc_patch})
-include(openblas)
-
set(PYBIND11_PYTHON_VERSION ${PY_VERSION})
include(cmake/pybind.cmake)
include_directories(${PYTHON_INCLUDE_DIR})
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/paddleaudio/third_party/)
+
# packages
find_package(Python3 COMPONENTS Interpreter Development)
diff --git a/audio/README.md b/audio/README.md
index bfd8625f..d42d4122 100644
--- a/audio/README.md
+++ b/audio/README.md
@@ -2,33 +2,22 @@
Installation: pip install paddleaudio
-目前支持的平台:Linux:
+Supported platforms: Linux, Mac, Windows
## Environment
## Build wheel
+cmd: python setup.py bdist_wheel
Linux test build whl environment:
-* docker - `registry.baidubce.com/paddlepaddle/paddle:2.2.2`
* os - Ubuntu 16.04.7 LTS
-* gcc/g++/gfortran - 8.2.0
+* gcc/g++ - 8.2.0
* cmake - 3.18.0 (need install)
-* [How to Install Docker](https://docs.docker.com/engine/install/)
-* [A Docker Tutorial for Beginners](https://docker-curriculum.com/)
-
-1. First to launch docker container.
-
-```
-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspace --name=dev registry.baidubce.com/paddlepaddle/paddle:2.2.2 /bin/bash
-```
-2. python setup.py bdist_wheel
-
Mac: test build whl environment:
* os
-* gcc/g++/gfortran 12.2.0
+* gcc/g++ 12.2.0
* cpu Intel Xeon E5 x86_64
Windows:
-not support: paddleaudio C++ extension lib (sox io, kaldi native fbank)
-python setup.py bdist_wheel
+the paddleaudio C++ extension lib (sox io, kaldi native fbank) is not supported
diff --git a/audio/paddleaudio/CMakeLists.txt b/audio/paddleaudio/CMakeLists.txt
index dbf2bd3e..c6b43c78 100644
--- a/audio/paddleaudio/CMakeLists.txt
+++ b/audio/paddleaudio/CMakeLists.txt
@@ -1,19 +1,3 @@
add_subdirectory(third_party)
add_subdirectory(src)
-
-if (APPLE)
- file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib
- DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib)
-endif(APPLE)
-
-if (UNIX AND NOT APPLE)
- file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgfortran.so.5
- DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN)
-
- file(COPY ${GFORTRAN_LIBRARIES_DIR}/libquadmath.so.0
- DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN)
-
- file(COPY ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.so.1
- DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/lib FOLLOW_SYMLINK_CHAIN)
-endif()
diff --git a/audio/paddleaudio/_internal/module_utils.py b/audio/paddleaudio/_internal/module_utils.py
index 7b3230de..becd23cd 100644
--- a/audio/paddleaudio/_internal/module_utils.py
+++ b/audio/paddleaudio/_internal/module_utils.py
@@ -67,8 +67,11 @@ def deprecated(direction: str, version: Optional[str]=None):
def is_kaldi_available():
- return is_module_available("paddleaudio._paddleaudio")
-
+ try:
+ from paddleaudio import _paddleaudio
+ return True
+ except Exception:
+ return False
def requires_kaldi():
if is_kaldi_available():
@@ -128,9 +131,11 @@ def requires_soundfile():
def is_sox_available():
- if platform.system() == "Windows": # sox is not supported on Windows
+ try:
+ from paddleaudio import _paddleaudio
+ return True
+ except Exception:
return False
- return is_module_available("paddleaudio._paddleaudio")
def requires_sox():
diff --git a/audio/paddleaudio/backends/soundfile_backend.py b/audio/paddleaudio/backends/soundfile_backend.py
index ae7b5b52..9195ea09 100644
--- a/audio/paddleaudio/backends/soundfile_backend.py
+++ b/audio/paddleaudio/backends/soundfile_backend.py
@@ -191,7 +191,7 @@ def soundfile_save(y: np.ndarray, sr: int, file:
os.PathLike) -> None: if sr <= 0: raise ParameterError( - f'Sample rate should be larger than 0, recieved sr = {sr}') + f'Sample rate should be larger than 0, received sr = {sr}') if y.dtype not in ['int16', 'int8']: warnings.warn( diff --git a/audio/paddleaudio/kaldi/__init__.py b/audio/paddleaudio/kaldi/__init__.py index f951e280..a0ae644d 100644 --- a/audio/paddleaudio/kaldi/__init__.py +++ b/audio/paddleaudio/kaldi/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. from .kaldi import fbank -from .kaldi import pitch +#from .kaldi import pitch diff --git a/audio/paddleaudio/kaldi/kaldi.py b/audio/paddleaudio/kaldi/kaldi.py index 16969d77..0f080de0 100644 --- a/audio/paddleaudio/kaldi/kaldi.py +++ b/audio/paddleaudio/kaldi/kaldi.py @@ -16,7 +16,6 @@ from paddleaudio._internal import module_utils __all__ = [ 'fbank', - 'pitch', ] @@ -33,8 +32,6 @@ def fbank( round_to_power_of_two: bool=True, blackman_coeff: float=0.42, snip_edges: bool=True, - allow_downsample: bool=False, - allow_upsample: bool=False, max_feature_vectors: int=-1, num_bins: int=23, low_freq: float=20, @@ -62,8 +59,6 @@ def fbank( frame_opts.round_to_power_of_two = round_to_power_of_two frame_opts.blackman_coeff = blackman_coeff frame_opts.snip_edges = snip_edges - frame_opts.allow_downsample = allow_downsample - frame_opts.allow_upsample = allow_upsample frame_opts.max_feature_vectors = max_feature_vectors mel_opts.num_bins = num_bins @@ -85,48 +80,48 @@ def fbank( return feat -@module_utils.requires_kaldi() -def pitch(wav, - samp_freq: int=16000, - frame_shift_ms: float=10.0, - frame_length_ms: float=25.0, - preemph_coeff: float=0.0, - min_f0: int=50, - max_f0: int=400, - soft_min_f0: float=10.0, - penalty_factor: float=0.1, - lowpass_cutoff: int=1000, - resample_freq: int=4000, - delta_pitch: float=0.005, - nccf_ballast: int=7000, - lowpass_filter_width: int=1, - upsample_filter_width: int=5, - max_frames_latency: int=0, - frames_per_chunk: int=0, - simulate_first_pass_online: bool=False, - recompute_frame: int=500, - nccf_ballast_online: bool=False, - snip_edges: bool=True): - pitch_opts = paddleaudio._paddleaudio.PitchExtractionOptions() - pitch_opts.samp_freq = samp_freq - pitch_opts.frame_shift_ms = frame_shift_ms - pitch_opts.frame_length_ms = frame_length_ms - pitch_opts.preemph_coeff = preemph_coeff - pitch_opts.min_f0 = min_f0 - pitch_opts.max_f0 = max_f0 - pitch_opts.soft_min_f0 = soft_min_f0 - pitch_opts.penalty_factor = penalty_factor - pitch_opts.lowpass_cutoff = lowpass_cutoff - pitch_opts.resample_freq = resample_freq - pitch_opts.delta_pitch = delta_pitch - pitch_opts.nccf_ballast = nccf_ballast - pitch_opts.lowpass_filter_width = lowpass_filter_width - pitch_opts.upsample_filter_width = upsample_filter_width - pitch_opts.max_frames_latency = max_frames_latency - pitch_opts.frames_per_chunk = frames_per_chunk - pitch_opts.simulate_first_pass_online = simulate_first_pass_online - pitch_opts.recompute_frame = recompute_frame - pitch_opts.nccf_ballast_online = nccf_ballast_online - pitch_opts.snip_edges = snip_edges - pitch = paddleaudio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav) - return pitch +#@module_utils.requires_kaldi() +#def pitch(wav, +#samp_freq: int=16000, +#frame_shift_ms: float=10.0, +#frame_length_ms: float=25.0, +#preemph_coeff: float=0.0, +#min_f0: int=50, +#max_f0: int=400, +#soft_min_f0: float=10.0, +#penalty_factor: float=0.1, +#lowpass_cutoff: int=1000, +#resample_freq: int=4000, +#delta_pitch: 
float=0.005, +#nccf_ballast: int=7000, +#lowpass_filter_width: int=1, +#upsample_filter_width: int=5, +#max_frames_latency: int=0, +#frames_per_chunk: int=0, +#simulate_first_pass_online: bool=False, +#recompute_frame: int=500, +#nccf_ballast_online: bool=False, +#snip_edges: bool=True): +#pitch_opts = paddleaudio._paddleaudio.PitchExtractionOptions() +#pitch_opts.samp_freq = samp_freq +#pitch_opts.frame_shift_ms = frame_shift_ms +#pitch_opts.frame_length_ms = frame_length_ms +#pitch_opts.preemph_coeff = preemph_coeff +#pitch_opts.min_f0 = min_f0 +#pitch_opts.max_f0 = max_f0 +#pitch_opts.soft_min_f0 = soft_min_f0 +#pitch_opts.penalty_factor = penalty_factor +#pitch_opts.lowpass_cutoff = lowpass_cutoff +#pitch_opts.resample_freq = resample_freq +#pitch_opts.delta_pitch = delta_pitch +#pitch_opts.nccf_ballast = nccf_ballast +#pitch_opts.lowpass_filter_width = lowpass_filter_width +#pitch_opts.upsample_filter_width = upsample_filter_width +#pitch_opts.max_frames_latency = max_frames_latency +#pitch_opts.frames_per_chunk = frames_per_chunk +#pitch_opts.simulate_first_pass_online = simulate_first_pass_online +#pitch_opts.recompute_frame = recompute_frame +#pitch_opts.nccf_ballast_online = nccf_ballast_online +#pitch_opts.snip_edges = snip_edges +#pitch = paddleaudio._paddleaudio.ComputeKaldiPitch(pitch_opts, wav) +#return pitch diff --git a/audio/paddleaudio/src/CMakeLists.txt b/audio/paddleaudio/src/CMakeLists.txt index fb6f3209..21e0f170 100644 --- a/audio/paddleaudio/src/CMakeLists.txt +++ b/audio/paddleaudio/src/CMakeLists.txt @@ -52,7 +52,7 @@ if(BUILD_KALDI) list( APPEND LIBPADDLEAUDIO_LINK_LIBRARIES - libkaldi + kaldi-native-fbank-core ) list( APPEND @@ -92,14 +92,6 @@ define_library( "${LIBPADDLEAUDIO_COMPILE_DEFINITIONS}" ) -if (APPLE) - add_custom_command(TARGET libpaddleaudio POST_BUILD COMMAND install_name_tool -change "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib" "@loader_path/libgcc_s.1.1.dylib" libpaddleaudio.so) -endif(APPLE) - -if (UNIX AND NOT APPLE) - set_target_properties(libpaddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN") -endif() - if (APPLE) set(AUDIO_LIBRARY libpaddleaudio CACHE INTERNAL "") else() @@ -207,11 +199,3 @@ define_extension( # ) # endif() endif() - -if (APPLE) - add_custom_command(TARGET _paddleaudio POST_BUILD COMMAND install_name_tool -change "${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib" "@loader_path/lib/libgcc_s.1.1.dylib" _paddleaudio.so) -endif(APPLE) - -if (UNIX AND NOT APPLE) - set_target_properties(_paddleaudio PROPERTIES INSTALL_RPATH "$ORIGIN/lib") -endif() diff --git a/audio/paddleaudio/src/pybind/kaldi/feature_common.h b/audio/paddleaudio/src/pybind/kaldi/feature_common.h index 05522bb7..6571fa3e 100644 --- a/audio/paddleaudio/src/pybind/kaldi/feature_common.h +++ b/audio/paddleaudio/src/pybind/kaldi/feature_common.h @@ -16,7 +16,7 @@ #include "pybind11/pybind11.h" #include "pybind11/numpy.h" -#include "feat/feature-window.h" +#include "kaldi-native-fbank/csrc/feature-window.h" namespace paddleaudio { namespace kaldi { @@ -28,18 +28,18 @@ class StreamingFeatureTpl { public: typedef typename F::Options Options; StreamingFeatureTpl(const Options& opts); - bool ComputeFeature(const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav, - ::kaldi::Vector<::kaldi::BaseFloat>* feats); - void Reset() { remained_wav_.Resize(0); } + bool ComputeFeature(const std::vector& wav, + std::vector* feats); + void Reset() { remained_wav_.resize(0); } int Dim() { return computer_.Dim(); } private: - bool Compute(const ::kaldi::Vector<::kaldi::BaseFloat>& waves, - 
::kaldi::Vector<::kaldi::BaseFloat>* feats); + bool Compute(const std::vector& waves, + std::vector* feats); Options opts_; - ::kaldi::FeatureWindowFunction window_function_; - ::kaldi::Vector<::kaldi::BaseFloat> remained_wav_; + knf::FeatureWindowFunction window_function_; + std::vector remained_wav_; F computer_; }; diff --git a/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h b/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h index c894b977..985d586f 100644 --- a/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h +++ b/audio/paddleaudio/src/pybind/kaldi/feature_common_inl.h @@ -12,7 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "base/kaldi-common.h" namespace paddleaudio { namespace kaldi { @@ -25,24 +24,29 @@ StreamingFeatureTpl::StreamingFeatureTpl(const Options& opts) template bool StreamingFeatureTpl::ComputeFeature( - const ::kaldi::VectorBase<::kaldi::BaseFloat>& wav, - ::kaldi::Vector<::kaldi::BaseFloat>* feats) { + const std::vector& wav, + std::vector* feats) { // append remaned waves - ::kaldi::int32 wav_len = wav.Dim(); + int wav_len = wav.size(); if (wav_len == 0) return false; - ::kaldi::int32 left_len = remained_wav_.Dim(); - ::kaldi::Vector<::kaldi::BaseFloat> waves(left_len + wav_len); - waves.Range(0, left_len).CopyFromVec(remained_wav_); - waves.Range(left_len, wav_len).CopyFromVec(wav); + int left_len = remained_wav_.size(); + std::vector waves(left_len + wav_len); + std::memcpy(waves.data(), + remained_wav_.data(), + left_len * sizeof(float)); + std::memcpy(waves.data() + left_len, + wav.data(), + wav_len * sizeof(float)); // cache remaned waves - ::kaldi::FrameExtractionOptions frame_opts = computer_.GetFrameOptions(); - ::kaldi::int32 num_frames = ::kaldi::NumFrames(waves.Dim(), frame_opts); - ::kaldi::int32 frame_shift = frame_opts.WindowShift(); - ::kaldi::int32 left_samples = waves.Dim() - frame_shift * num_frames; - remained_wav_.Resize(left_samples); - remained_wav_.CopyFromVec( - waves.Range(frame_shift * num_frames, left_samples)); + knf::FrameExtractionOptions frame_opts = computer_.GetFrameOptions(); + int num_frames = knf::NumFrames(waves.size(), frame_opts); + int frame_shift = frame_opts.WindowShift(); + int left_samples = waves.size() - frame_shift * num_frames; + remained_wav_.resize(left_samples); + std::memcpy(remained_wav_.data(), + waves.data() + frame_shift * num_frames, + left_samples * sizeof(float)); // compute speech feature Compute(waves, feats); @@ -51,40 +55,39 @@ bool StreamingFeatureTpl::ComputeFeature( // Compute feat template -bool StreamingFeatureTpl::Compute( - const ::kaldi::Vector<::kaldi::BaseFloat>& waves, - ::kaldi::Vector<::kaldi::BaseFloat>* feats) { - ::kaldi::BaseFloat vtln_warp = 1.0; - const ::kaldi::FrameExtractionOptions& frame_opts = - computer_.GetFrameOptions(); - ::kaldi::int32 num_samples = waves.Dim(); - ::kaldi::int32 frame_length = frame_opts.WindowSize(); - ::kaldi::int32 sample_rate = frame_opts.samp_freq; +bool StreamingFeatureTpl::Compute(const std::vector& waves, + std::vector* feats) { + const knf::FrameExtractionOptions& frame_opts = computer_.GetFrameOptions(); + int num_samples = waves.size(); + int frame_length = frame_opts.WindowSize(); + int sample_rate = frame_opts.samp_freq; if (num_samples < frame_length) { - return false; + return true; } - ::kaldi::int32 num_frames = ::kaldi::NumFrames(num_samples, frame_opts); - feats->Resize(num_frames * Dim()); + int num_frames = knf::NumFrames(num_samples, 
frame_opts); + feats->resize(num_frames * Dim()); - ::kaldi::Vector<::kaldi::BaseFloat> window; + std::vector window; bool need_raw_log_energy = computer_.NeedRawLogEnergy(); - for (::kaldi::int32 frame = 0; frame < num_frames; frame++) { - ::kaldi::BaseFloat raw_log_energy = 0.0; - ::kaldi::ExtractWindow(0, - waves, - frame, - frame_opts, - window_function_, - &window, - need_raw_log_energy ? &raw_log_energy : NULL); + for (int frame = 0; frame < num_frames; frame++) { + std::fill(window.begin(), window.end(), 0); + float raw_log_energy = 0.0; + float vtln_warp = 1.0; + knf::ExtractWindow(0, + waves, + frame, + frame_opts, + window_function_, + &window, + need_raw_log_energy ? &raw_log_energy : NULL); - ::kaldi::Vector<::kaldi::BaseFloat> this_feature(computer_.Dim(), - ::kaldi::kUndefined); - computer_.Compute(raw_log_energy, vtln_warp, &window, &this_feature); - ::kaldi::SubVector<::kaldi::BaseFloat> output_row( - feats->Data() + frame * Dim(), Dim()); - output_row.CopyFromVec(this_feature); + std::vector this_feature(computer_.Dim()); + computer_.Compute( + raw_log_energy, vtln_warp, &window, this_feature.data()); + std::memcpy(feats->data() + frame * Dim(), + this_feature.data(), + sizeof(float) * Dim()); } return true; } diff --git a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc index 40e3786e..83df454c 100644 --- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc +++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.cc @@ -13,16 +13,16 @@ // limitations under the License. #include "paddleaudio/src/pybind/kaldi/kaldi_feature.h" -#include "feat/pitch-functions.h" +//#include "feat/pitch-functions.h" namespace paddleaudio { namespace kaldi { bool InitFbank( - ::kaldi::FrameExtractionOptions frame_opts, - ::kaldi::MelBanksOptions mel_opts, + knf::FrameExtractionOptions frame_opts, + knf::MelBanksOptions mel_opts, FbankOptions fbank_opts) { - ::kaldi::FbankOptions opts; + knf::FbankOptions opts; opts.frame_opts = frame_opts; opts.mel_opts = mel_opts; opts.use_energy = fbank_opts.use_energy; @@ -41,8 +41,8 @@ py::array_t ComputeFbankStreaming(const py::array_t& wav) { } py::array_t ComputeFbank( - ::kaldi::FrameExtractionOptions frame_opts, - ::kaldi::MelBanksOptions mel_opts, + knf::FrameExtractionOptions frame_opts, + knf::MelBanksOptions mel_opts, FbankOptions fbank_opts, const py::array_t& wav) { InitFbank(frame_opts, mel_opts, fbank_opts); @@ -55,21 +55,21 @@ void ResetFbank() { paddleaudio::kaldi::KaldiFeatureWrapper::GetInstance()->ResetFbank(); } -py::array_t ComputeKaldiPitch( - const ::kaldi::PitchExtractionOptions& opts, - const py::array_t& wav) { - py::buffer_info info = wav.request(); - ::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size); +//py::array_t ComputeKaldiPitch( + //const ::kaldi::PitchExtractionOptions& opts, + //const py::array_t& wav) { + //py::buffer_info info = wav.request(); + //::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size); - ::kaldi::Matrix<::kaldi::BaseFloat> features; - ::kaldi::ComputeKaldiPitch(opts, input_wav, &features); - auto result = py::array_t({features.NumRows(), features.NumCols()}); - for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) { - std::memcpy(result.mutable_data(row_idx), features.Row(row_idx).Data(), - sizeof(float)*features.NumCols()); - } - return result; -} + //::kaldi::Matrix<::kaldi::BaseFloat> features; + //::kaldi::ComputeKaldiPitch(opts, input_wav, &features); + //auto result = 
py::array_t({features.NumRows(), features.NumCols()}); + //for (int row_idx = 0; row_idx < features.NumRows(); ++row_idx) { + //std::memcpy(result.mutable_data(row_idx), features.Row(row_idx).Data(), + //sizeof(float)*features.NumCols()); + //} + //return result; +//} } // namespace kaldi } // namespace paddleaudio diff --git a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h index e059c52c..031ec863 100644 --- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h +++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature.h @@ -19,7 +19,7 @@ #include #include "paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h" -#include "feat/pitch-functions.h" +//#include "feat/pitch-functions.h" namespace py = pybind11; @@ -42,13 +42,13 @@ struct FbankOptions{ }; bool InitFbank( - ::kaldi::FrameExtractionOptions frame_opts, - ::kaldi::MelBanksOptions mel_opts, + knf::FrameExtractionOptions frame_opts, + knf::MelBanksOptions mel_opts, FbankOptions fbank_opts); py::array_t ComputeFbank( - ::kaldi::FrameExtractionOptions frame_opts, - ::kaldi::MelBanksOptions mel_opts, + knf::FrameExtractionOptions frame_opts, + knf::MelBanksOptions mel_opts, FbankOptions fbank_opts, const py::array_t& wav); @@ -56,9 +56,9 @@ py::array_t ComputeFbankStreaming(const py::array_t& wav); void ResetFbank(); -py::array_t ComputeKaldiPitch( - const ::kaldi::PitchExtractionOptions& opts, - const py::array_t& wav); +//py::array_t ComputeKaldiPitch( + //const ::kaldi::PitchExtractionOptions& opts, + //const py::array_t& wav); } // namespace kaldi } // namespace paddleaudio diff --git a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc index 79558046..8b8ff18b 100644 --- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc +++ b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.cc @@ -22,7 +22,7 @@ KaldiFeatureWrapper* KaldiFeatureWrapper::GetInstance() { return &instance; } -bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) { +bool KaldiFeatureWrapper::InitFbank(knf::FbankOptions opts) { fbank_.reset(new Fbank(opts)); return true; } @@ -30,21 +30,18 @@ bool KaldiFeatureWrapper::InitFbank(::kaldi::FbankOptions opts) { py::array_t KaldiFeatureWrapper::ComputeFbank( const py::array_t wav) { py::buffer_info info = wav.request(); - ::kaldi::SubVector<::kaldi::BaseFloat> input_wav((float*)info.ptr, info.size); + std::vector input_wav((float*)info.ptr, (float*)info.ptr + info.size); - ::kaldi::Vector<::kaldi::BaseFloat> feats; + std::vector feats; bool flag = fbank_->ComputeFeature(input_wav, &feats); - if (flag == false || feats.Dim() == 0) return py::array_t(); - auto result = py::array_t(feats.Dim()); + if (flag == false || feats.size() == 0) return py::array_t(); + auto result = py::array_t(feats.size()); py::buffer_info xs = result.request(); - std::cout << std::endl; float* res_ptr = (float*)xs.ptr; - for (int idx = 0; idx < feats.Dim(); ++idx) { - *res_ptr = feats(idx); - res_ptr++; - } - - return result.reshape({feats.Dim() / Dim(), Dim()}); + std::memcpy(res_ptr, feats.data(), sizeof(float)*feats.size()); + std::vector shape{static_cast(feats.size() / Dim()), + static_cast(Dim())}; + return result.reshape(shape); } } // namesapce kaldi diff --git a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h index bee1eee0..daad2d58 100644 --- a/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h +++ 
b/audio/paddleaudio/src/pybind/kaldi/kaldi_feature_wrapper.h @@ -14,20 +14,18 @@ #pragma once -#include "base/kaldi-common.h" -#include "feat/feature-fbank.h" - +#include "paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h" #include "paddleaudio/src/pybind/kaldi/feature_common.h" namespace paddleaudio { namespace kaldi { -typedef StreamingFeatureTpl<::kaldi::FbankComputer> Fbank; +typedef StreamingFeatureTpl Fbank; class KaldiFeatureWrapper { public: static KaldiFeatureWrapper* GetInstance(); - bool InitFbank(::kaldi::FbankOptions opts); + bool InitFbank(knf::FbankOptions opts); py::array_t ComputeFbank(const py::array_t wav); int Dim() { return fbank_->Dim(); } void ResetFbank() { fbank_->Reset(); } diff --git a/audio/paddleaudio/src/pybind/pybind.cpp b/audio/paddleaudio/src/pybind/pybind.cpp index 692e8099..51071203 100644 --- a/audio/paddleaudio/src/pybind/pybind.cpp +++ b/audio/paddleaudio/src/pybind/pybind.cpp @@ -2,7 +2,7 @@ #ifdef INCLUDE_KALDI #include "paddleaudio/src/pybind/kaldi/kaldi_feature.h" -#include "paddleaudio/third_party/kaldi/feat/feature-fbank.h" +#include "paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h" #endif #ifdef INCLUDE_SOX @@ -89,53 +89,51 @@ PYBIND11_MODULE(_paddleaudio, m) { #ifdef INCLUDE_KALDI m.def("ComputeFbank", &paddleaudio::kaldi::ComputeFbank, "compute fbank"); - py::class_(m, "PitchExtractionOptions") - .def(py::init<>()) - .def_readwrite("samp_freq", &kaldi::PitchExtractionOptions::samp_freq) - .def_readwrite("frame_shift_ms", &kaldi::PitchExtractionOptions::frame_shift_ms) - .def_readwrite("frame_length_ms", &kaldi::PitchExtractionOptions::frame_length_ms) - .def_readwrite("preemph_coeff", &kaldi::PitchExtractionOptions::preemph_coeff) - .def_readwrite("min_f0", &kaldi::PitchExtractionOptions::min_f0) - .def_readwrite("max_f0", &kaldi::PitchExtractionOptions::max_f0) - .def_readwrite("soft_min_f0", &kaldi::PitchExtractionOptions::soft_min_f0) - .def_readwrite("penalty_factor", &kaldi::PitchExtractionOptions::penalty_factor) - .def_readwrite("lowpass_cutoff", &kaldi::PitchExtractionOptions::lowpass_cutoff) - .def_readwrite("resample_freq", &kaldi::PitchExtractionOptions::resample_freq) - .def_readwrite("delta_pitch", &kaldi::PitchExtractionOptions::delta_pitch) - .def_readwrite("nccf_ballast", &kaldi::PitchExtractionOptions::nccf_ballast) - .def_readwrite("lowpass_filter_width", &kaldi::PitchExtractionOptions::lowpass_filter_width) - .def_readwrite("upsample_filter_width", &kaldi::PitchExtractionOptions::upsample_filter_width) - .def_readwrite("max_frames_latency", &kaldi::PitchExtractionOptions::max_frames_latency) - .def_readwrite("frames_per_chunk", &kaldi::PitchExtractionOptions::frames_per_chunk) - .def_readwrite("simulate_first_pass_online", &kaldi::PitchExtractionOptions::simulate_first_pass_online) - .def_readwrite("recompute_frame", &kaldi::PitchExtractionOptions::recompute_frame) - .def_readwrite("nccf_ballast_online", &kaldi::PitchExtractionOptions::nccf_ballast_online) - .def_readwrite("snip_edges", &kaldi::PitchExtractionOptions::snip_edges); - m.def("ComputeKaldiPitch", &paddleaudio::kaldi::ComputeKaldiPitch, "compute kaldi pitch"); - py::class_(m, "FrameExtractionOptions") + //py::class_(m, "PitchExtractionOptions") + //.def(py::init<>()) + //.def_readwrite("samp_freq", &kaldi::PitchExtractionOptions::samp_freq) + //.def_readwrite("frame_shift_ms", &kaldi::PitchExtractionOptions::frame_shift_ms) + //.def_readwrite("frame_length_ms", &kaldi::PitchExtractionOptions::frame_length_ms) + 
//.def_readwrite("preemph_coeff", &kaldi::PitchExtractionOptions::preemph_coeff) + //.def_readwrite("min_f0", &kaldi::PitchExtractionOptions::min_f0) + //.def_readwrite("max_f0", &kaldi::PitchExtractionOptions::max_f0) + //.def_readwrite("soft_min_f0", &kaldi::PitchExtractionOptions::soft_min_f0) + //.def_readwrite("penalty_factor", &kaldi::PitchExtractionOptions::penalty_factor) + //.def_readwrite("lowpass_cutoff", &kaldi::PitchExtractionOptions::lowpass_cutoff) + //.def_readwrite("resample_freq", &kaldi::PitchExtractionOptions::resample_freq) + //.def_readwrite("delta_pitch", &kaldi::PitchExtractionOptions::delta_pitch) + //.def_readwrite("nccf_ballast", &kaldi::PitchExtractionOptions::nccf_ballast) + //.def_readwrite("lowpass_filter_width", &kaldi::PitchExtractionOptions::lowpass_filter_width) + //.def_readwrite("upsample_filter_width", &kaldi::PitchExtractionOptions::upsample_filter_width) + //.def_readwrite("max_frames_latency", &kaldi::PitchExtractionOptions::max_frames_latency) + //.def_readwrite("frames_per_chunk", &kaldi::PitchExtractionOptions::frames_per_chunk) + //.def_readwrite("simulate_first_pass_online", &kaldi::PitchExtractionOptions::simulate_first_pass_online) + //.def_readwrite("recompute_frame", &kaldi::PitchExtractionOptions::recompute_frame) + //.def_readwrite("nccf_ballast_online", &kaldi::PitchExtractionOptions::nccf_ballast_online) + //.def_readwrite("snip_edges", &kaldi::PitchExtractionOptions::snip_edges); + //m.def("ComputeKaldiPitch", &paddleaudio::kaldi::ComputeKaldiPitch, "compute kaldi pitch"); + py::class_(m, "FrameExtractionOptions") .def(py::init<>()) - .def_readwrite("samp_freq", &kaldi::FrameExtractionOptions::samp_freq) - .def_readwrite("frame_shift_ms", &kaldi::FrameExtractionOptions::frame_shift_ms) - .def_readwrite("frame_length_ms", &kaldi::FrameExtractionOptions::frame_length_ms) - .def_readwrite("dither", &kaldi::FrameExtractionOptions::dither) - .def_readwrite("preemph_coeff", &kaldi::FrameExtractionOptions::preemph_coeff) - .def_readwrite("remove_dc_offset", &kaldi::FrameExtractionOptions::remove_dc_offset) - .def_readwrite("window_type", &kaldi::FrameExtractionOptions::window_type) - .def_readwrite("round_to_power_of_two", &kaldi::FrameExtractionOptions::round_to_power_of_two) - .def_readwrite("blackman_coeff", &kaldi::FrameExtractionOptions::blackman_coeff) - .def_readwrite("snip_edges", &kaldi::FrameExtractionOptions::snip_edges) - .def_readwrite("allow_downsample", &kaldi::FrameExtractionOptions::allow_downsample) - .def_readwrite("allow_upsample", &kaldi::FrameExtractionOptions::allow_upsample) - .def_readwrite("max_feature_vectors", &kaldi::FrameExtractionOptions::max_feature_vectors); - py::class_(m, "MelBanksOptions") + .def_readwrite("samp_freq", &knf::FrameExtractionOptions::samp_freq) + .def_readwrite("frame_shift_ms", &knf::FrameExtractionOptions::frame_shift_ms) + .def_readwrite("frame_length_ms", &knf::FrameExtractionOptions::frame_length_ms) + .def_readwrite("dither", &knf::FrameExtractionOptions::dither) + .def_readwrite("preemph_coeff", &knf::FrameExtractionOptions::preemph_coeff) + .def_readwrite("remove_dc_offset", &knf::FrameExtractionOptions::remove_dc_offset) + .def_readwrite("window_type", &knf::FrameExtractionOptions::window_type) + .def_readwrite("round_to_power_of_two", &knf::FrameExtractionOptions::round_to_power_of_two) + .def_readwrite("blackman_coeff", &knf::FrameExtractionOptions::blackman_coeff) + .def_readwrite("snip_edges", &knf::FrameExtractionOptions::snip_edges) + .def_readwrite("max_feature_vectors", 
&knf::FrameExtractionOptions::max_feature_vectors); + py::class_(m, "MelBanksOptions") .def(py::init<>()) - .def_readwrite("num_bins", &kaldi::MelBanksOptions::num_bins) - .def_readwrite("low_freq", &kaldi::MelBanksOptions::low_freq) - .def_readwrite("high_freq", &kaldi::MelBanksOptions::high_freq) - .def_readwrite("vtln_low", &kaldi::MelBanksOptions::vtln_low) - .def_readwrite("vtln_high", &kaldi::MelBanksOptions::vtln_high) - .def_readwrite("debug_mel", &kaldi::MelBanksOptions::debug_mel) - .def_readwrite("htk_mode", &kaldi::MelBanksOptions::htk_mode); + .def_readwrite("num_bins", &knf::MelBanksOptions::num_bins) + .def_readwrite("low_freq", &knf::MelBanksOptions::low_freq) + .def_readwrite("high_freq", &knf::MelBanksOptions::high_freq) + .def_readwrite("vtln_low", &knf::MelBanksOptions::vtln_low) + .def_readwrite("vtln_high", &knf::MelBanksOptions::vtln_high) + .def_readwrite("debug_mel", &knf::MelBanksOptions::debug_mel) + .def_readwrite("htk_mode", &knf::MelBanksOptions::htk_mode); py::class_(m, "FbankOptions") .def(py::init<>()) diff --git a/audio/paddleaudio/third_party/CMakeLists.txt b/audio/paddleaudio/third_party/CMakeLists.txt index 43288f39..4b85bada 100644 --- a/audio/paddleaudio/third_party/CMakeLists.txt +++ b/audio/paddleaudio/third_party/CMakeLists.txt @@ -11,5 +11,6 @@ endif() # kaldi ################################################################################ if (BUILD_KALDI) - add_subdirectory(kaldi) -endif() \ No newline at end of file + include_directories(${CMAKE_CURRENT_SOURCE_DIR}) + add_subdirectory(kaldi-native-fbank/csrc) +endif() diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/CMakeLists.txt b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/CMakeLists.txt new file mode 100644 index 00000000..176607fc --- /dev/null +++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/CMakeLists.txt @@ -0,0 +1,22 @@ +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../../) +add_library(kaldi-native-fbank-core + feature-fbank.cc + feature-functions.cc + feature-window.cc + fftsg.c + log.cc + mel-computations.cc + rfft.cc +) +# We are using std::call_once() in log.h,which requires us to link with -pthread +if(NOT WIN32) + target_link_libraries(kaldi-native-fbank-core -pthread) +endif() + +if(KNF_HAVE_EXECINFO_H) + target_compile_definitions(kaldi-native-fbank-core PRIVATE KNF_HAVE_EXECINFO_H=1) +endif() + +if(KNF_HAVE_CXXABI_H) + target_compile_definitions(kaldi-native-fbank-core PRIVATE KNF_HAVE_CXXABI_H=1) +endif() diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.cc b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.cc new file mode 100644 index 00000000..740ee17e --- /dev/null +++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.cc @@ -0,0 +1,117 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +// This file is copied/modified from kaldi/src/feat/feature-fbank.cc +// +#include "kaldi-native-fbank/csrc/feature-fbank.h" + +#include + +#include "kaldi-native-fbank/csrc/feature-functions.h" + +namespace knf { + +static void Sqrt(float *in_out, int32_t n) { + for (int32_t i = 0; i != n; ++i) { + in_out[i] = std::sqrt(in_out[i]); + } +} + +std::ostream &operator<<(std::ostream &os, const FbankOptions &opts) { + os << opts.ToString(); + return os; +} + +FbankComputer::FbankComputer(const FbankOptions &opts) + : opts_(opts), rfft_(opts.frame_opts.PaddedWindowSize()) { + if (opts.energy_floor > 0.0f) { + log_energy_floor_ = logf(opts.energy_floor); + } + + // We'll definitely need the filterbanks info for VTLN warping factor 1.0. + // [note: this call caches it.] + GetMelBanks(1.0f); +} + +FbankComputer::~FbankComputer() { + for (auto iter = mel_banks_.begin(); iter != mel_banks_.end(); ++iter) + delete iter->second; +} + +const MelBanks *FbankComputer::GetMelBanks(float vtln_warp) { + MelBanks *this_mel_banks = nullptr; + + // std::map::iterator iter = mel_banks_.find(vtln_warp); + auto iter = mel_banks_.find(vtln_warp); + if (iter == mel_banks_.end()) { + this_mel_banks = new MelBanks(opts_.mel_opts, opts_.frame_opts, vtln_warp); + mel_banks_[vtln_warp] = this_mel_banks; + } else { + this_mel_banks = iter->second; + } + return this_mel_banks; +} + +void FbankComputer::Compute(float signal_raw_log_energy, float vtln_warp, + std::vector *signal_frame, float *feature) { + const MelBanks &mel_banks = *(GetMelBanks(vtln_warp)); + + KNF_CHECK_EQ(signal_frame->size(), opts_.frame_opts.PaddedWindowSize()); + + // Compute energy after window function (not the raw one). + if (opts_.use_energy && !opts_.raw_energy) { + signal_raw_log_energy = std::log( + std::max(InnerProduct(signal_frame->data(), signal_frame->data(), + signal_frame->size()), + std::numeric_limits::epsilon())); + } + rfft_.Compute(signal_frame->data()); // signal_frame is modified in-place + ComputePowerSpectrum(signal_frame); + + // Use magnitude instead of power if requested. + if (!opts_.use_power) { + Sqrt(signal_frame->data(), signal_frame->size() / 2 + 1); + } + + int32_t mel_offset = ((opts_.use_energy && !opts_.htk_compat) ? 1 : 0); + + // Its length is opts_.mel_opts.num_bins + float *mel_energies = feature + mel_offset; + + // Sum with mel filter banks over the power spectrum + mel_banks.Compute(signal_frame->data(), mel_energies); + + if (opts_.use_log_fbank) { + // Avoid log of zero (which should be prevented anyway by dithering). + for (int32_t i = 0; i != opts_.mel_opts.num_bins; ++i) { + auto t = std::max(mel_energies[i], std::numeric_limits::epsilon()); + mel_energies[i] = std::log(t); + } + } + + // Copy energy as first value (or the last, if htk_compat == true). + if (opts_.use_energy) { + if (opts_.energy_floor > 0.0 && signal_raw_log_energy < log_energy_floor_) { + signal_raw_log_energy = log_energy_floor_; + } + int32_t energy_index = opts_.htk_compat ? 
opts_.mel_opts.num_bins : 0; + feature[energy_index] = signal_raw_log_energy; + } +} + +} // namespace knf diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h new file mode 100644 index 00000000..0ef3fac0 --- /dev/null +++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-fbank.h @@ -0,0 +1,132 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This file is copied/modified from kaldi/src/feat/feature-fbank.h + +#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_ +#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_ + +#include + +#include "kaldi-native-fbank/csrc/feature-window.h" +#include "kaldi-native-fbank/csrc/mel-computations.h" +#include "kaldi-native-fbank/csrc/rfft.h" + +namespace knf { + +struct FbankOptions { + FrameExtractionOptions frame_opts; + MelBanksOptions mel_opts; + // append an extra dimension with energy to the filter banks + bool use_energy = false; + float energy_floor = 0.0f; // active iff use_energy==true + + // If true, compute log_energy before preemphasis and windowing + // If false, compute log_energy after preemphasis ans windowing + bool raw_energy = true; // active iff use_energy==true + + // If true, put energy last (if using energy) + // If false, put energy first + bool htk_compat = false; // active iff use_energy==true + + // if true (default), produce log-filterbank, else linear + bool use_log_fbank = true; + + // if true (default), use power in filterbank + // analysis, else magnitude. + bool use_power = true; + + FbankOptions() { mel_opts.num_bins = 23; } + + std::string ToString() const { + std::ostringstream os; + os << "frame_opts: \n"; + os << frame_opts << "\n"; + os << "\n"; + + os << "mel_opts: \n"; + os << mel_opts << "\n"; + + os << "use_energy: " << use_energy << "\n"; + os << "energy_floor: " << energy_floor << "\n"; + os << "raw_energy: " << raw_energy << "\n"; + os << "htk_compat: " << htk_compat << "\n"; + os << "use_log_fbank: " << use_log_fbank << "\n"; + os << "use_power: " << use_power << "\n"; + return os.str(); + } +}; + +std::ostream &operator<<(std::ostream &os, const FbankOptions &opts); + +class FbankComputer { + public: + using Options = FbankOptions; + + explicit FbankComputer(const FbankOptions &opts); + ~FbankComputer(); + + int32_t Dim() const { + return opts_.mel_opts.num_bins + (opts_.use_energy ? 1 : 0); + } + + // if true, compute log_energy_pre_window but after dithering and dc removal + bool NeedRawLogEnergy() const { return opts_.use_energy && opts_.raw_energy; } + + const FrameExtractionOptions &GetFrameOptions() const { + return opts_.frame_opts; + } + + const FbankOptions &GetOptions() const { return opts_; } + + /** + Function that computes one frame of features from + one frame of signal. 
+
+     @param [in] signal_raw_log_energy The log-energy of the frame of the
+         signal prior to windowing and pre-emphasis, or
+         log(numeric_limits<float>::min()), whichever is greater. Must be
+         ignored by this function if this class returns false from
+         this->NeedRawLogEnergy().
+     @param [in] vtln_warp The VTLN warping factor that the user wants
+         to be applied when computing features for this utterance. Will
+         normally be 1.0, meaning no warping is to be done. The value will
+         be ignored for feature types that don't support VTLN, such as
+         spectrogram features.
+     @param [in] signal_frame One frame of the signal,
+       as extracted using the function ExtractWindow() using the options
+       returned by this->GetFrameOptions(). The function will use the
+       vector as a workspace, which is why it's a non-const pointer.
+     @param [out] feature Pointer to a vector of size this->Dim(), to which
+         the computed feature will be written. It should be pre-allocated.
+  */
+  void Compute(float signal_raw_log_energy, float vtln_warp,
+               std::vector<float> *signal_frame, float *feature);
+
+ private:
+  const MelBanks *GetMelBanks(float vtln_warp);
+
+  FbankOptions opts_;
+  float log_energy_floor_;
+  std::map<float, MelBanks *> mel_banks_;  // float is VTLN coefficient.
+  Rfft rfft_;
+};
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_FEATURE_FBANK_H_
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.cc b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.cc
new file mode 100644
index 00000000..00ae4c79
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.cc
@@ -0,0 +1,49 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/feature-functions.cc
+
+#include "kaldi-native-fbank/csrc/feature-functions.h"
+
+#include <cmath>
+#include <cstdint>
+
+namespace knf {
+
+void ComputePowerSpectrum(std::vector<float> *complex_fft) {
+  int32_t dim = complex_fft->size();
+
+  // now we have in complex_fft, first half of complex spectrum
+  // it's stored as [real0, realN/2, real1, im1, real2, im2, ...]
+
+  float *p = complex_fft->data();
+  int32_t half_dim = dim / 2;
+  float first_energy = p[0] * p[0];
+  float last_energy = p[1] * p[1];  // handle this special case
+
+  for (int32_t i = 1; i < half_dim; ++i) {
+    float real = p[i * 2];
+    float im = p[i * 2 + 1];
+    p[i] = real * real + im * im;
+  }
+  p[0] = first_energy;
+  p[half_dim] = last_energy;  // Will actually never be used, and anyway
+  // if the signal has been bandlimited sensibly this should be zero.
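+
+  // Added illustration (not from the original kaldi sources): for dim == 8
+  // the rfft packing is [r0, r4, r1, i1, r2, i2, r3, i3], and after the
+  // loop above the buffer holds
+  //   [r0*r0, r1*r1 + i1*i1, r2*r2 + i2*i2, r3*r3 + i3*i3, r4*r4, ...]
+  // i.e. entries 0..dim/2 are valid power values and the remaining entries
+  // are stale (undefined at output, as documented in the header).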
+}
+
+}  // namespace knf
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.h b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.h
new file mode 100644
index 00000000..852d0612
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-functions.h
@@ -0,0 +1,38 @@
+/**
+ * Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// This file is copied/modified from kaldi/src/feat/feature-functions.h
+#ifndef KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
+#define KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
+
+#include <vector>
+namespace knf {
+
+// ComputePowerSpectrum takes a complex FFT (as produced by the FFT
+// functions in csrc/rfft.h) and converts it into
+// a power spectrum. If the complex FFT is a vector of size n (representing
+// half of the complex FFT of a real signal of size n, as described there),
+// this function computes in the first (n/2) + 1 elements of it, the
+// energies of the fft bins from zero to the Nyquist frequency. Contents of the
+// remaining (n/2) - 1 elements are undefined at output.
+
+void ComputePowerSpectrum(std::vector<float> *complex_fft);
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_FEATURE_FUNCTIONS_H
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.cc b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.cc
new file mode 100644
index 00000000..b86a2c3d
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.cc
@@ -0,0 +1,236 @@
+// kaldi-native-fbank/csrc/feature-window.cc
+//
+// Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+
+// This file is copied/modified from kaldi/src/feat/feature-window.cc
+
+#include "kaldi-native-fbank/csrc/feature-window.h"
+
+#include <cmath>
+#include <vector>
+
+#ifndef M_2PI
+#define M_2PI 6.283185307179586476925286766559005
+#endif
+
+namespace knf {
+
+std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts) {
+  os << opts.ToString();
+  return os;
+}
+
+FeatureWindowFunction::FeatureWindowFunction(const FrameExtractionOptions &opts)
+    : window_(opts.WindowSize()) {
+  int32_t frame_length = opts.WindowSize();
+  KNF_CHECK_GT(frame_length, 0);
+
+  float *window_data = window_.data();
+
+  double a = M_2PI / (frame_length - 1);
+  for (int32_t i = 0; i < frame_length; i++) {
+    double i_fl = static_cast<double>(i);
+    if (opts.window_type == "hanning") {
+      window_data[i] = 0.5 - 0.5 * cos(a * i_fl);
+    } else if (opts.window_type == "sine") {
+      // when you are checking with Wikipedia, please
+      // note that 0.5 * a = M_PI/(frame_length-1)
+      window_data[i] = sin(0.5 * a * i_fl);
+    } else if (opts.window_type == "hamming") {
+      window_data[i] = 0.54 - 0.46 * cos(a * i_fl);
+    } else if (opts.window_type ==
+               "povey") {  // like hamming but goes to zero at edges.
+ window_data[i] = pow(0.5 - 0.5 * cos(a * i_fl), 0.85); + } else if (opts.window_type == "rectangular") { + window_data[i] = 1.0; + } else if (opts.window_type == "blackman") { + window_data[i] = opts.blackman_coeff - 0.5 * cos(a * i_fl) + + (0.5 - opts.blackman_coeff) * cos(2 * a * i_fl); + } else { + KNF_LOG(FATAL) << "Invalid window type " << opts.window_type; + } + } +} + +void FeatureWindowFunction::Apply(float *wave) const { + int32_t window_size = window_.size(); + const float *p = window_.data(); + for (int32_t k = 0; k != window_size; ++k) { + wave[k] *= p[k]; + } +} + +int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts) { + int64_t frame_shift = opts.WindowShift(); + if (opts.snip_edges) { + return frame * frame_shift; + } else { + int64_t midpoint_of_frame = frame_shift * frame + frame_shift / 2, + beginning_of_frame = midpoint_of_frame - opts.WindowSize() / 2; + return beginning_of_frame; + } +} + +int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts, + bool flush /*= true*/) { + int64_t frame_shift = opts.WindowShift(); + int64_t frame_length = opts.WindowSize(); + if (opts.snip_edges) { + // with --snip-edges=true (the default), we use a HTK-like approach to + // determining the number of frames-- all frames have to fit completely into + // the waveform, and the first frame begins at sample zero. + if (num_samples < frame_length) + return 0; + else + return (1 + ((num_samples - frame_length) / frame_shift)); + // You can understand the expression above as follows: 'num_samples - + // frame_length' is how much room we have to shift the frame within the + // waveform; 'frame_shift' is how much we shift it each time; and the ratio + // is how many times we can shift it (integer arithmetic rounds down). + } else { + // if --snip-edges=false, the number of frames is determined by rounding the + // (file-length / frame-shift) to the nearest integer. The point of this + // formula is to make the number of frames an obvious and predictable + // function of the frame shift and signal length, which makes many + // segmentation-related questions simpler. + // + // Because integer division in C++ rounds toward zero, we add (half the + // frame-shift minus epsilon) before dividing, to have the effect of + // rounding towards the closest integer. + int32_t num_frames = (num_samples + (frame_shift / 2)) / frame_shift; + + if (flush) return num_frames; + + // note: 'end' always means the last plus one, i.e. one past the last. + int64_t end_sample_of_last_frame = + FirstSampleOfFrame(num_frames - 1, opts) + frame_length; + + // the following code is optimized more for clarity than efficiency. + // If flush == false, we can't output frames that extend past the end + // of the signal. 
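+    // Added worked example (not in the original source): with a 16 kHz
+    // signal, frame_shift = 160 samples (10 ms) and num_samples = 10000,
+    // this rounding gives (10000 + 80) / 160 = 63 frames, whereas
+    // snip_edges == true would give 1 + (10000 - 400) / 160 = 61 frames
+    // for a 400-sample (25 ms) window.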
+    while (num_frames > 0 && end_sample_of_last_frame > num_samples) {
+      num_frames--;
+      end_sample_of_last_frame -= frame_shift;
+    }
+    return num_frames;
+  }
+}
+
+void ExtractWindow(int64_t sample_offset, const std::vector<float> &wave,
+                   int32_t f, const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function,
+                   std::vector<float> *window,
+                   float *log_energy_pre_window /*= nullptr*/) {
+  KNF_CHECK(sample_offset >= 0 && wave.size() != 0);
+
+  int32_t frame_length = opts.WindowSize();
+  int32_t frame_length_padded = opts.PaddedWindowSize();
+
+  int64_t num_samples = sample_offset + wave.size();
+  int64_t start_sample = FirstSampleOfFrame(f, opts);
+  int64_t end_sample = start_sample + frame_length;
+
+  if (opts.snip_edges) {
+    KNF_CHECK(start_sample >= sample_offset && end_sample <= num_samples);
+  } else {
+    KNF_CHECK(sample_offset == 0 || start_sample >= sample_offset);
+  }
+
+  if (window->size() != frame_length_padded) {
+    window->resize(frame_length_padded);
+  }
+
+  // wave_start and wave_end are start and end indexes into 'wave', for the
+  // piece of wave that we're trying to extract.
+  int32_t wave_start = int32_t(start_sample - sample_offset);
+  int32_t wave_end = wave_start + frame_length;
+
+  if (wave_start >= 0 && wave_end <= wave.size()) {
+    // the normal case-- no edge effects to consider.
+    std::copy(wave.begin() + wave_start,
+              wave.begin() + wave_start + frame_length, window->data());
+  } else {
+    // Deal with any end effects by reflection, if needed. This code will only
+    // be reached for about two frames per utterance, so we don't concern
+    // ourselves excessively with efficiency.
+    int32_t wave_dim = wave.size();
+    for (int32_t s = 0; s < frame_length; ++s) {
+      int32_t s_in_wave = s + wave_start;
+      while (s_in_wave < 0 || s_in_wave >= wave_dim) {
+        // reflect around the beginning or end of the wave.
+        // e.g. -1 -> 0, -2 -> 1.
+        // dim -> dim - 1, dim + 1 -> dim - 2.
+        // the code supports repeated reflections, although this
+        // would only be needed in pathological cases.
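+        // Added concrete sketch (not in the original source): with
+        // wave_dim = 100, s_in_wave = -2 maps to -(-2) - 1 = 1, and
+        // s_in_wave = 101 maps to 2 * 100 - 1 - 101 = 98, i.e. the signal
+        // is mirrored at both edges.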
+        if (s_in_wave < 0)
+          s_in_wave = -s_in_wave - 1;
+        else
+          s_in_wave = 2 * wave_dim - 1 - s_in_wave;
+      }
+      (*window)[s] = wave[s_in_wave];
+    }
+  }
+
+  ProcessWindow(opts, window_function, window->data(), log_energy_pre_window);
+}
+
+static void RemoveDcOffset(float *d, int32_t n) {
+  float sum = 0;
+  for (int32_t i = 0; i != n; ++i) {
+    sum += d[i];
+  }
+
+  float mean = sum / n;
+
+  for (int32_t i = 0; i != n; ++i) {
+    d[i] -= mean;
+  }
+}
+
+float InnerProduct(const float *a, const float *b, int32_t n) {
+  float sum = 0;
+  for (int32_t i = 0; i != n; ++i) {
+    sum += a[i] * b[i];
+  }
+  return sum;
+}
+
+static void Preemphasize(float *d, int32_t n, float preemph_coeff) {
+  if (preemph_coeff == 0.0) {
+    return;
+  }
+
+  KNF_CHECK(preemph_coeff >= 0.0 && preemph_coeff <= 1.0);
+
+  for (int32_t i = n - 1; i > 0; --i) {
+    d[i] -= preemph_coeff * d[i - 1];
+  }
+  d[0] -= preemph_coeff * d[0];
+}
+
+void ProcessWindow(const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function, float *window,
+                   float *log_energy_pre_window /*= nullptr*/) {
+  int32_t frame_length = opts.WindowSize();
+
+  // TODO(fangjun): Remove dither
+  KNF_CHECK_EQ(opts.dither, 0);
+
+  if (opts.remove_dc_offset) {
+    RemoveDcOffset(window, frame_length);
+  }
+
+  if (log_energy_pre_window != NULL) {
+    float energy = std::max(InnerProduct(window, window, frame_length),
+                            std::numeric_limits<float>::epsilon());
+    *log_energy_pre_window = std::log(energy);
+  }
+
+  if (opts.preemph_coeff != 0.0) {
+    Preemphasize(window, frame_length, opts.preemph_coeff);
+  }
+
+  window_function.Apply(window);
+}
+
+}  // namespace knf
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.h b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.h
new file mode 100644
index 00000000..a33844f4
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/feature-window.h
@@ -0,0 +1,178 @@
+// kaldi-native-fbank/csrc/feature-window.h
+//
+// Copyright (c)  2022  Xiaomi Corporation (authors: Fangjun Kuang)
+
+// This file is copied/modified from kaldi/src/feat/feature-window.h
+
+#ifndef KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
+#define KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "kaldi-native-fbank/csrc/log.h"
+
+namespace knf {
+
+inline int32_t RoundUpToNearestPowerOfTwo(int32_t n) {
+  // copied from kaldi/src/base/kaldi-math.cc
+  KNF_CHECK_GT(n, 0);
+  n--;
+  n |= n >> 1;
+  n |= n >> 2;
+  n |= n >> 4;
+  n |= n >> 8;
+  n |= n >> 16;
+  return n + 1;
+}
+
+struct FrameExtractionOptions {
+  float samp_freq = 16000;
+  float frame_shift_ms = 10.0f;   // in milliseconds.
+  float frame_length_ms = 25.0f;  // in milliseconds.
+  float dither = 1.0f;            // Amount of dithering, 0.0 means no dither.
+  float preemph_coeff = 0.97f;    // Preemphasis coefficient.
+  bool remove_dc_offset = true;   // Subtract mean of wave before FFT.
+  std::string window_type = "povey";  // e.g. Hamming window
+  // May be "hamming", "rectangular", "povey", "hanning", "sine", "blackman"
+  // "povey" is a window I made to be similar to Hamming but to go to zero at
+  // the edges, it's pow((0.5 - 0.5*cos(n/N*2*pi)), 0.85) I just don't think the
+  // Hamming window makes sense as a windowing function.
+  bool round_to_power_of_two = true;
+  float blackman_coeff = 0.42f;
+  bool snip_edges = true;
+  // bool allow_downsample = false;
+  // bool allow_upsample = false;
+
+  // Used for streaming feature extraction. It indicates the number
+  // of feature frames to keep in the recycling vector.
+  // -1 means to keep all feature frames.
+  int32_t max_feature_vectors = -1;
+
+  int32_t WindowShift() const {
+    return static_cast<int32_t>(samp_freq * 0.001f * frame_shift_ms);
+  }
+  int32_t WindowSize() const {
+    return static_cast<int32_t>(samp_freq * 0.001f * frame_length_ms);
+  }
+  int32_t PaddedWindowSize() const {
+    return (round_to_power_of_two ? RoundUpToNearestPowerOfTwo(WindowSize())
+                                  : WindowSize());
+  }
+  std::string ToString() const {
+    std::ostringstream os;
+#define KNF_PRINT(x) os << #x << ": " << x << "\n"
+    KNF_PRINT(samp_freq);
+    KNF_PRINT(frame_shift_ms);
+    KNF_PRINT(frame_length_ms);
+    KNF_PRINT(dither);
+    KNF_PRINT(preemph_coeff);
+    KNF_PRINT(remove_dc_offset);
+    KNF_PRINT(window_type);
+    KNF_PRINT(round_to_power_of_two);
+    KNF_PRINT(blackman_coeff);
+    KNF_PRINT(snip_edges);
+    // KNF_PRINT(allow_downsample);
+    // KNF_PRINT(allow_upsample);
+    KNF_PRINT(max_feature_vectors);
+#undef KNF_PRINT
+    return os.str();
+  }
+};
+
+std::ostream &operator<<(std::ostream &os, const FrameExtractionOptions &opts);
+
+class FeatureWindowFunction {
+ public:
+  FeatureWindowFunction() = default;
+  explicit FeatureWindowFunction(const FrameExtractionOptions &opts);
+  /**
+   * @param wave Pointer to a 1-D array of shape [window_size].
+   *             It is modified in-place: wave[i] = wave[i] * window_[i].
+   */
+  void Apply(float *wave) const;
+
+ private:
+  std::vector<float> window_;  // of size opts.WindowSize()
+};
+
+int64_t FirstSampleOfFrame(int32_t frame, const FrameExtractionOptions &opts);
+
+/**
+   This function returns the number of frames that we can extract from a wave
+   file with the given number of samples in it (assumed to have the same
+   sampling rate as specified in 'opts').
+
+   @param [in] num_samples The number of samples in the wave file.
+   @param [in] opts     The frame-extraction options class
+
+   @param [in] flush   True if we are asserting that this number of samples
+     is 'all there is', false if we are expecting more data to possibly come
+     in. This only makes a difference to the answer
+     if opts.snip_edges == false. For offline feature extraction you always
+     want flush == true. In an online-decoding context, once you know (or
+     decide) that no more data is coming in, you'd call it with flush == true
+     at the end to flush out any remaining data.
+*/
+int32_t NumFrames(int64_t num_samples, const FrameExtractionOptions &opts,
+                  bool flush = true);
+
+/*
+  ExtractWindow() extracts a windowed frame of waveform (possibly with a
+  power-of-two, padded size, depending on the config), including all the
+  processing done by ProcessWindow().
+
+  @param [in] sample_offset  If 'wave' is not the entire waveform, but
+                   part of it to the left has been discarded, then the
+                   number of samples prior to 'wave' that we have
+                   already discarded.  Set this to zero if you are
+                   processing the entire waveform in one piece, or
+                   if you get 'no matching function' compilation
+                   errors when updating the code.
+  @param [in] wave  The waveform
+  @param [in] f     The frame index to be extracted, with
+                    0 <= f < NumFrames(sample_offset + wave.size(), opts, true)
+  @param [in] opts  The options class to be used
+  @param [in] window_function  The windowing function, as derived from the
+                    options class.
+  @param [out] window  The windowed, possibly-padded waveform to be
+                    extracted.  Will be resized as needed.
+  @param [out] log_energy_pre_window  If non-NULL, the log-energy of
+                    the signal prior to pre-emphasis and multiplying by
+                    the windowing function will be written to here.
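+
+  A minimal usage sketch (added for illustration; ReadWave() is a
+  hypothetical helper, not part of this library):
+
+      FrameExtractionOptions opts;
+      FeatureWindowFunction window_fn(opts);
+      std::vector<float> wave = ReadWave();
+      std::vector<float> frame;
+      for (int32_t f = 0; f != NumFrames(wave.size(), opts); ++f) {
+        ExtractWindow(0, wave, f, opts, window_fn, &frame);
+      }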
+*/
+void ExtractWindow(int64_t sample_offset, const std::vector<float> &wave,
+                   int32_t f, const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function,
+                   std::vector<float> *window,
+                   float *log_energy_pre_window = nullptr);
+
+/**
+  This function does all the windowing steps after actually
+  extracting the windowed signal: depending on the
+  configuration, it does dithering, dc offset removal,
+  preemphasis, and multiplication by the windowing function.
+   @param [in] opts  The options class to be used
+   @param [in] window_function  The windowing function-- should have
+                    been initialized using 'opts'.
+   @param [in,out] window  A vector of size opts.WindowSize().  Note:
+      it will typically be a sub-vector of a larger vector of size
+      opts.PaddedWindowSize(), with the remaining samples zero,
+      as the FFT code is more efficient if it operates on data with
+      power-of-two size.
+   @param [out] log_energy_pre_window If non-NULL, then after dithering and
+      DC offset removal, this function will write to this pointer the log of
+      the total energy (i.e. sum-squared) of the frame.
+ */
+void ProcessWindow(const FrameExtractionOptions &opts,
+                   const FeatureWindowFunction &window_function, float *window,
+                   float *log_energy_pre_window = nullptr);
+
+// Compute the inner product of two vectors
+float InnerProduct(const float *a, const float *b, int32_t n);
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FEAT_CSRC_FEATURE_WINDOW_H_
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/fftsg.c b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/fftsg.c
new file mode 100644
index 00000000..ec8217a2
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/fftsg.c
@@ -0,0 +1,3271 @@
+/* This file is copied from
+ * https://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
+ */
+/*
+Fast Fourier/Cosine/Sine Transform
+    dimension   :one
+    data length :power of 2
+    decimation  :frequency
+    radix       :split-radix
+    data        :inplace
+    table       :use
+functions
+    cdft: Complex Discrete Fourier Transform
+    rdft: Real Discrete Fourier Transform
+    ddct: Discrete Cosine Transform
+    ddst: Discrete Sine Transform
+    dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+function prototypes
+    void cdft(int, int, double *, int *, double *);
+    void rdft(int, int, double *, int *, double *);
+    void ddct(int, int, double *, int *, double *);
+    void ddst(int, int, double *, int *, double *);
+    void dfct(int, double *, double *, int *, double *);
+    void dfst(int, double *, double *, int *, double *);
+macro definitions
+    USE_CDFT_PTHREADS : default=not defined
+        CDFT_THREADS_BEGIN_N  : must be >= 512, default=8192
+        CDFT_4THREADS_BEGIN_N : must be >= 512, default=65536
+    USE_CDFT_WINTHREADS : default=not defined
+        CDFT_THREADS_BEGIN_N  : must be >= 512, default=32768
+        CDFT_4THREADS_BEGIN_N : must be >= 512, default=524288
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+    [definition]
+        <case1>
+            X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
+        <case2>
+            X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            cdft(2*n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            cdft(2*n, -1, a, ip, w);
+    [parameters]
+        2*n            :data length (int)
+                        n >= 1, n = power of 2
+        a[0...2*n-1]   :input/output data (double *)
+                        input data
+                            a[2*j] = Re(x[j]),
+                            a[2*j+1] = Im(x[j]), 0<=j<n
+                        output data
+                            a[2*k] = Re(X[k]),
+                            a[2*k+1] = Im(X[k]), 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            cdft(2*n, -1, a, ip, w);
+        is
+            cdft(2*n, 1, a, ip, w);
+            for (j = 0; j <= 2 * n - 1; j++) {
+                a[j] *= 1.0 / n;
+            }
+        .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+    [definition]
+        <case1> RDFT
+            R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+            I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
+        <case2> IRDFT (excluding scale)
+            a[k] = (R[0] + R[n/2]*cos(pi*k))/2 +
+                   sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) +
+                   sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            rdft(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            rdft(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            output data
+                                a[2*k] = R[k], 0<=k<n/2
+                                a[1] = R[n/2]
+                        <case2>
+                            input data
+                                a[2*j] = R[j], 0<=j<n/2
+                                a[2*j+1] = I[j], 0<j<n/2
+                                a[1] = R[n/2]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            rdft(n, 1, a, ip, w);
+        is
+            rdft(n, -1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> IDCT (excluding scale)
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DCT
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddct(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddct(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            ddct(n, -1, a, ip, w);
+        is
+            a[0] *= 0.5;
+            ddct(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+    [definition]
+        <case1> IDST (excluding scale)
+            S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DST
+            S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddst(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddst(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            input data
+                                a[j] = A[j], 0<j<n
+                                a[0] = A[n]
+                            output data
+                                a[k] = S[k], 0<=k<n
+                        <case2>
+                            output data
+                                a[k] = S[k], 0<k<n
+                                a[0] = S[n]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            ddst(n, -1, a, ip, w);
+        is
+            a[0] *= 0.5;
+            ddst(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+    [definition]
+        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
+    [usage]
+        ip[0] = 0; // first time only
+        dfct(n, a, t, ip, w);
+    [parameters]
+        n              :data length - 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n]       :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<=n
+        t[0...n/2]     :work area (double *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+        is
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+            for (j = 0; j <= n; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+    [definition]
+        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
+    [usage]
+        ip[0] = 0; // first time only
+        dfst(n, a, t, ip, w);
+    [parameters]
+        n              :data length + 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = S[k], 0<k<n
+                        (a[0] is used for work area)
+        t[0...n/2-1]   :work area (double *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            dfst(n, a, t, ip, w);
+        is
+            dfst(n, a, t, ip, w);
+            for (j = 1; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+Appendix :
+    The cos/sin table is recalculated when the larger table required.
+    w[] and ip[] are compatible with all routines.
+*/
+
+
+void cdft(int n, int isgn, double *a, int *ip, double *w) {
+  void makewt(int nw, int *ip, double *w);
+  void cftfsub(int n, double *a, int *ip, int nw, double *w);
+  void cftbsub(int n, double *a, int *ip, int nw, double *w);
+  int nw;
+
+  nw = ip[0];
+  if (n > (nw << 2)) {
+    nw = n >> 2;
+    makewt(nw, ip, w);
+  }
+  if (isgn >= 0) {
+    cftfsub(n, a, ip, nw, w);
+  } else {
+    cftbsub(n, a, ip, nw, w);
+  }
+}
+
+
+void rdft(int n, int isgn, double *a, int *ip, double *w) {
+  void makewt(int nw, int *ip, double *w);
+  void makect(int nc, int *ip, double *c);
+  void cftfsub(int n, double *a, int *ip, int nw, double *w);
+  void cftbsub(int n, double *a, int *ip, int nw, double *w);
+  void rftfsub(int n, double *a, int nc, double *c);
+  void rftbsub(int n, double *a, int nc, double *c);
+  int nw, nc;
+  double xi;
+
+  nw = ip[0];
+  if (n > (nw << 2)) {
+    nw = n >> 2;
+    makewt(nw, ip, w);
+  }
+  nc = ip[1];
+  if (n > (nc << 2)) {
+    nc = n >> 2;
+    makect(nc, ip, w + nw);
+  }
+  if (isgn >= 0) {
+    if (n > 4) {
+      cftfsub(n, a, ip, nw, w);
+      rftfsub(n, a, nc, w + nw);
+    } else if (n == 4) {
+      cftfsub(n, a, ip, nw, w);
+    }
+    xi = a[0] - a[1];
+    a[0] += a[1];
+    a[1] = xi;
+  } else {
+    a[1] = 0.5 * (a[0] - a[1]);
+    a[0] -= a[1];
+    if (n > 4) {
+      rftbsub(n, a, nc, w + nw);
+      cftbsub(n, a, ip, nw, w);
+    } else if (n == 4) {
+      cftbsub(n, a, ip, nw, w);
+    }
+  }
+}
+
+
+void ddct(int n, int isgn, double *a, int *ip, double *w) {
+  void makewt(int nw, int *ip, double *w);
+  void makect(int nc, int *ip, double *c);
+  void cftfsub(int n, double *a, int *ip, int nw, double *w);
+  void cftbsub(int n, double *a, int *ip, int nw, double *w);
+  void rftfsub(int n, double *a, int nc, double *c);
+  void rftbsub(int n, double *a, int nc, double *c);
+  void dctsub(int n, double *a, int nc, double *c);
+  int j, nw, nc;
+  double xr;
+
+  nw = ip[0];
+  if (n > (nw << 2)) {
+    nw = n >> 2;
+    makewt(nw, ip, w);
+  }
+  nc = ip[1];
+  if (n > nc) {
+    nc = n;
+    makect(nc, ip, w + nw);
+  }
+  if (isgn < 0) {
+    xr = a[n - 1];
+    for (j = n - 2; j >= 2; j -= 2) {
+      a[j + 1] = a[j] - a[j - 1];
+      a[j] += a[j - 1];
+    }
+    a[1] = a[0] - xr;
+    a[0] += xr;
+    if (n > 4) {
+      rftbsub(n, a, nc, w + nw);
+      cftbsub(n, a, ip, nw, w);
+    } else if (n == 4) {
+      cftbsub(n, a, ip, nw, w);
+    }
+  }
+  dctsub(n, a, nc, w + nw);
+  if (isgn >= 0) {
+    if (n > 4) {
+      cftfsub(n, a, ip, nw, w);
+      rftfsub(n, a, nc, w + nw);
+    } else if (n == 4) {
+      cftfsub(n, a, ip, nw, w);
+    }
+    xr = a[0] - a[1];
+    a[0] += a[1];
+    for (j = 2; j < n; j += 2) {
+      a[j - 1] = a[j] - a[j + 1];
+      a[j] += a[j + 1];
} + a[n - 1] = xr; + } +} + + +void ddst(int n, int isgn, double *a, int *ip, double *w) { + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void cftbsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void rftbsub(int n, double *a, int nc, double *c); + void dstsub(int n, double *a, int nc, double *c); + int j, nw, nc; + double xr; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > nc) { + nc = n; + makect(nc, ip, w + nw); + } + if (isgn < 0) { + xr = a[n - 1]; + for (j = n - 2; j >= 2; j -= 2) { + a[j + 1] = -a[j] - a[j - 1]; + a[j] -= a[j - 1]; + } + a[1] = a[0] + xr; + a[0] -= xr; + if (n > 4) { + rftbsub(n, a, nc, w + nw); + cftbsub(n, a, ip, nw, w); + } else if (n == 4) { + cftbsub(n, a, ip, nw, w); + } + } + dstsub(n, a, nc, w + nw); + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a, ip, nw, w); + rftfsub(n, a, nc, w + nw); + } else if (n == 4) { + cftfsub(n, a, ip, nw, w); + } + xr = a[0] - a[1]; + a[0] += a[1]; + for (j = 2; j < n; j += 2) { + a[j - 1] = -a[j] - a[j + 1]; + a[j] -= a[j + 1]; + } + a[n - 1] = -xr; + } +} + + +void dfct(int n, double *a, double *t, int *ip, double *w) { + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void dctsub(int n, double *a, int nc, double *c); + int j, k, l, m, mh, nw, nc; + double xr, xi, yr, yi; + + nw = ip[0]; + if (n > (nw << 3)) { + nw = n >> 3; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > (nc << 1)) { + nc = n >> 1; + makect(nc, ip, w + nw); + } + m = n >> 1; + yi = a[m]; + xi = a[0] + a[n]; + a[0] -= a[n]; + t[0] = xi - yi; + t[m] = xi + yi; + if (n > 2) { + mh = m >> 1; + for (j = 1; j < mh; j++) { + k = m - j; + xr = a[j] - a[n - j]; + xi = a[j] + a[n - j]; + yr = a[k] - a[n - k]; + yi = a[k] + a[n - k]; + a[j] = xr; + a[k] = yr; + t[j] = xi - yi; + t[k] = xi + yi; + } + t[mh] = a[mh] + a[n - mh]; + a[mh] -= a[n - mh]; + dctsub(m, a, nc, w + nw); + if (m > 4) { + cftfsub(m, a, ip, nw, w); + rftfsub(m, a, nc, w + nw); + } else if (m == 4) { + cftfsub(m, a, ip, nw, w); + } + a[n - 1] = a[0] - a[1]; + a[1] = a[0] + a[1]; + for (j = m - 2; j >= 2; j -= 2) { + a[2 * j + 1] = a[j] + a[j + 1]; + a[2 * j - 1] = a[j] - a[j + 1]; + } + l = 2; + m = mh; + while (m >= 2) { + dctsub(m, t, nc, w + nw); + if (m > 4) { + cftfsub(m, t, ip, nw, w); + rftfsub(m, t, nc, w + nw); + } else if (m == 4) { + cftfsub(m, t, ip, nw, w); + } + a[n - l] = t[0] - t[1]; + a[l] = t[0] + t[1]; + k = 0; + for (j = 2; j < m; j += 2) { + k += l << 2; + a[k - l] = t[j] - t[j + 1]; + a[k + l] = t[j] + t[j + 1]; + } + l <<= 1; + mh = m >> 1; + for (j = 0; j < mh; j++) { + k = m - j; + t[j] = t[m + k] - t[m + j]; + t[k] = t[m + k] + t[m + j]; + } + t[mh] = t[m + mh]; + m = mh; + } + a[l] = t[0]; + a[n] = t[2] - t[1]; + a[0] = t[2] + t[1]; + } else { + a[1] = a[0]; + a[2] = t[0]; + a[0] = t[1]; + } +} + + +void dfst(int n, double *a, double *t, int *ip, double *w) { + void makewt(int nw, int *ip, double *w); + void makect(int nc, int *ip, double *c); + void cftfsub(int n, double *a, int *ip, int nw, double *w); + void rftfsub(int n, double *a, int nc, double *c); + void dstsub(int n, double *a, int nc, double *c); + int j, k, l, m, mh, nw, nc; + double xr, xi, yr, yi; + + nw = ip[0]; + if (n > (nw << 3)) { + nw = n >> 
3; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > (nc << 1)) { + nc = n >> 1; + makect(nc, ip, w + nw); + } + if (n > 2) { + m = n >> 1; + mh = m >> 1; + for (j = 1; j < mh; j++) { + k = m - j; + xr = a[j] + a[n - j]; + xi = a[j] - a[n - j]; + yr = a[k] + a[n - k]; + yi = a[k] - a[n - k]; + a[j] = xr; + a[k] = yr; + t[j] = xi + yi; + t[k] = xi - yi; + } + t[0] = a[mh] - a[n - mh]; + a[mh] += a[n - mh]; + a[0] = a[m]; + dstsub(m, a, nc, w + nw); + if (m > 4) { + cftfsub(m, a, ip, nw, w); + rftfsub(m, a, nc, w + nw); + } else if (m == 4) { + cftfsub(m, a, ip, nw, w); + } + a[n - 1] = a[1] - a[0]; + a[1] = a[0] + a[1]; + for (j = m - 2; j >= 2; j -= 2) { + a[2 * j + 1] = a[j] - a[j + 1]; + a[2 * j - 1] = -a[j] - a[j + 1]; + } + l = 2; + m = mh; + while (m >= 2) { + dstsub(m, t, nc, w + nw); + if (m > 4) { + cftfsub(m, t, ip, nw, w); + rftfsub(m, t, nc, w + nw); + } else if (m == 4) { + cftfsub(m, t, ip, nw, w); + } + a[n - l] = t[1] - t[0]; + a[l] = t[0] + t[1]; + k = 0; + for (j = 2; j < m; j += 2) { + k += l << 2; + a[k - l] = -t[j] - t[j + 1]; + a[k + l] = t[j] - t[j + 1]; + } + l <<= 1; + mh = m >> 1; + for (j = 1; j < mh; j++) { + k = m - j; + t[j] = t[m + k] + t[m + j]; + t[k] = t[m + k] - t[m + j]; + } + t[0] = t[m + mh]; + m = mh; + } + a[l] = t[0]; + } + a[0] = 0; +} + + +/* -------- initializing routines -------- */ + + +#include + +void makewt(int nw, int *ip, double *w) { + void makeipt(int nw, int *ip); + int j, nwh, nw0, nw1; + double delta, wn4r, wk1r, wk1i, wk3r, wk3i; + + ip[0] = nw; + ip[1] = 1; + if (nw > 2) { + nwh = nw >> 1; + delta = atan(1.0) / nwh; + wn4r = cos(delta * nwh); + w[0] = 1; + w[1] = wn4r; + if (nwh == 4) { + w[2] = cos(delta * 2); + w[3] = sin(delta * 2); + } else if (nwh > 4) { + makeipt(nw, ip); + w[2] = 0.5 / cos(delta * 2); + w[3] = 0.5 / cos(delta * 6); + for (j = 4; j < nwh; j += 4) { + w[j] = cos(delta * j); + w[j + 1] = sin(delta * j); + w[j + 2] = cos(3 * delta * j); + w[j + 3] = -sin(3 * delta * j); + } + } + nw0 = 0; + while (nwh > 2) { + nw1 = nw0 + nwh; + nwh >>= 1; + w[nw1] = 1; + w[nw1 + 1] = wn4r; + if (nwh == 4) { + wk1r = w[nw0 + 4]; + wk1i = w[nw0 + 5]; + w[nw1 + 2] = wk1r; + w[nw1 + 3] = wk1i; + } else if (nwh > 4) { + wk1r = w[nw0 + 4]; + wk3r = w[nw0 + 6]; + w[nw1 + 2] = 0.5 / wk1r; + w[nw1 + 3] = 0.5 / wk3r; + for (j = 4; j < nwh; j += 4) { + wk1r = w[nw0 + 2 * j]; + wk1i = w[nw0 + 2 * j + 1]; + wk3r = w[nw0 + 2 * j + 2]; + wk3i = w[nw0 + 2 * j + 3]; + w[nw1 + j] = wk1r; + w[nw1 + j + 1] = wk1i; + w[nw1 + j + 2] = wk3r; + w[nw1 + j + 3] = wk3i; + } + } + nw0 = nw1; + } + } +} + + +void makeipt(int nw, int *ip) { + int j, l, m, m2, p, q; + + ip[2] = 0; + ip[3] = 16; + m = 2; + for (l = nw; l > 32; l >>= 2) { + m2 = m << 1; + q = m2 << 3; + for (j = m; j < m2; j++) { + p = ip[j] << 2; + ip[m + j] = p; + ip[m2 + j] = p + q; + } + m = m2; + } +} + + +void makect(int nc, int *ip, double *c) { + int j, nch; + double delta; + + ip[1] = nc; + if (nc > 1) { + nch = nc >> 1; + delta = atan(1.0) / nch; + c[0] = cos(delta * nch); + c[nch] = 0.5 * c[0]; + for (j = 1; j < nch; j++) { + c[j] = 0.5 * cos(delta * j); + c[nc - j] = 0.5 * sin(delta * j); + } + } +} + + +/* -------- child routines -------- */ + + +#ifdef USE_CDFT_PTHREADS +#define USE_CDFT_THREADS +#ifndef CDFT_THREADS_BEGIN_N +#define CDFT_THREADS_BEGIN_N 8192 +#endif +#ifndef CDFT_4THREADS_BEGIN_N +#define CDFT_4THREADS_BEGIN_N 65536 +#endif +#include +#include +#include +#define cdft_thread_t pthread_t +#define cdft_thread_create(thp, func, argp) \ + { \ + if (pthread_create(thp, 
NULL, func, (void *)argp) != 0) { \ + fprintf(stderr, "cdft thread error\n"); \ + exit(1); \ + } \ + } +#define cdft_thread_wait(th) \ + { \ + if (pthread_join(th, NULL) != 0) { \ + fprintf(stderr, "cdft thread error\n"); \ + exit(1); \ + } \ + } +#endif /* USE_CDFT_PTHREADS */ + + +#ifdef USE_CDFT_WINTHREADS +#define USE_CDFT_THREADS +#ifndef CDFT_THREADS_BEGIN_N +#define CDFT_THREADS_BEGIN_N 32768 +#endif +#ifndef CDFT_4THREADS_BEGIN_N +#define CDFT_4THREADS_BEGIN_N 524288 +#endif +#include +#include +#include +#define cdft_thread_t HANDLE +#define cdft_thread_create(thp, func, argp) \ + { \ + DWORD thid; \ + *(thp) = CreateThread( \ + NULL, 0, (LPTHREAD_START_ROUTINE)func, (LPVOID)argp, 0, &thid); \ + if (*(thp) == 0) { \ + fprintf(stderr, "cdft thread error\n"); \ + exit(1); \ + } \ + } +#define cdft_thread_wait(th) \ + { \ + WaitForSingleObject(th, INFINITE); \ + CloseHandle(th); \ + } +#endif /* USE_CDFT_WINTHREADS */ + + +void cftfsub(int n, double *a, int *ip, int nw, double *w) { + void bitrv2(int n, int *ip, double *a); + void bitrv216(double *a); + void bitrv208(double *a); + void cftf1st(int n, double *a, double *w); + void cftrec4(int n, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftfx41(int n, double *a, int nw, double *w); + void cftf161(double *a, double *w); + void cftf081(double *a, double *w); + void cftf040(double *a); + void cftx020(double *a); +#ifdef USE_CDFT_THREADS + void cftrec4_th(int n, double *a, int nw, double *w); +#endif /* USE_CDFT_THREADS */ + + if (n > 8) { + if (n > 32) { + cftf1st(n, a, &w[nw - (n >> 2)]); +#ifdef USE_CDFT_THREADS + if (n > CDFT_THREADS_BEGIN_N) { + cftrec4_th(n, a, nw, w); + } else +#endif /* USE_CDFT_THREADS */ + if (n > 512) { + cftrec4(n, a, nw, w); + } else if (n > 128) { + cftleaf(n, 1, a, nw, w); + } else { + cftfx41(n, a, nw, w); + } + bitrv2(n, ip, a); + } else if (n == 32) { + cftf161(a, &w[nw - 8]); + bitrv216(a); + } else { + cftf081(a, w); + bitrv208(a); + } + } else if (n == 8) { + cftf040(a); + } else if (n == 4) { + cftx020(a); + } +} + + +void cftbsub(int n, double *a, int *ip, int nw, double *w) { + void bitrv2conj(int n, int *ip, double *a); + void bitrv216neg(double *a); + void bitrv208neg(double *a); + void cftb1st(int n, double *a, double *w); + void cftrec4(int n, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftfx41(int n, double *a, int nw, double *w); + void cftf161(double *a, double *w); + void cftf081(double *a, double *w); + void cftb040(double *a); + void cftx020(double *a); +#ifdef USE_CDFT_THREADS + void cftrec4_th(int n, double *a, int nw, double *w); +#endif /* USE_CDFT_THREADS */ + + if (n > 8) { + if (n > 32) { + cftb1st(n, a, &w[nw - (n >> 2)]); +#ifdef USE_CDFT_THREADS + if (n > CDFT_THREADS_BEGIN_N) { + cftrec4_th(n, a, nw, w); + } else +#endif /* USE_CDFT_THREADS */ + if (n > 512) { + cftrec4(n, a, nw, w); + } else if (n > 128) { + cftleaf(n, 1, a, nw, w); + } else { + cftfx41(n, a, nw, w); + } + bitrv2conj(n, ip, a); + } else if (n == 32) { + cftf161(a, &w[nw - 8]); + bitrv216neg(a); + } else { + cftf081(a, w); + bitrv208neg(a); + } + } else if (n == 8) { + cftb040(a); + } else if (n == 4) { + cftx020(a); + } +} + + +void bitrv2(int n, int *ip, double *a) { + int j, j1, k, k1, l, m, nh, nm; + double xr, xi, yr, yi; + + m = 1; + for (l = n >> 2; l > 8; l >>= 2) { + m <<= 1; + } + nh = n >> 1; + nm = 4 * m; + if (l == 8) { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 
4 * j + 2 * ip[m + k]; + k1 = 4 * k + 2 * ip[m + j]; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + 2 * ip[m + k]; + j1 = k1 + 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= 2; + k1 -= nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh + 2; + k1 += nh + 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh - nm; + k1 += 2 * nm - 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + } else { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + ip[m 
+ k]; + k1 = 4 * k + ip[m + j]; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + ip[m + k]; + j1 = k1 + 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + } +} + + +void bitrv2conj(int n, int *ip, double *a) { + int j, j1, k, k1, l, m, nh, nm; + double xr, xi, yr, yi; + + m = 1; + for (l = n >> 2; l > 8; l >>= 2) { + m <<= 1; + } + nh = n >> 1; + nm = 4 * m; + if (l == 8) { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + 2 * ip[m + k]; + k1 = 4 * k + 2 * ip[m + j]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = 
yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + 2 * ip[m + k]; + j1 = k1 + 2; + k1 += nh; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= 2; + k1 -= nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh + 2; + k1 += nh + 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh - nm; + k1 += 2 * nm - 2; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + } + } else { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + ip[m + k]; + k1 = 4 * k + ip[m + j]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + ip[m + k]; + j1 = k1 + 2; + k1 += nh; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + j1 += nm; + k1 += nm; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 
1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + } + } +} + + +void bitrv216(double *a) { + double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, x5r, x5i, x7r, x7i, x8r, x8i, + x10r, x10i, x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x7r = a[14]; + x7i = a[15]; + x8r = a[16]; + x8i = a[17]; + x10r = a[20]; + x10i = a[21]; + x11r = a[22]; + x11i = a[23]; + x12r = a[24]; + x12i = a[25]; + x13r = a[26]; + x13i = a[27]; + x14r = a[28]; + x14i = a[29]; + a[2] = x8r; + a[3] = x8i; + a[4] = x4r; + a[5] = x4i; + a[6] = x12r; + a[7] = x12i; + a[8] = x2r; + a[9] = x2i; + a[10] = x10r; + a[11] = x10i; + a[14] = x14r; + a[15] = x14i; + a[16] = x1r; + a[17] = x1i; + a[20] = x5r; + a[21] = x5i; + a[22] = x13r; + a[23] = x13i; + a[24] = x3r; + a[25] = x3i; + a[26] = x11r; + a[27] = x11i; + a[28] = x7r; + a[29] = x7i; +} + + +void bitrv216neg(double *a) { + double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, x5r, x5i, x6r, x6i, x7r, x7i, + x8r, x8i, x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i, x13r, x13i, + x14r, x14i, x15r, x15i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x6r = a[12]; + x6i = a[13]; + x7r = a[14]; + x7i = a[15]; + x8r = a[16]; + x8i = a[17]; + x9r = a[18]; + x9i = a[19]; + x10r = a[20]; + x10i = a[21]; + x11r = a[22]; + x11i = a[23]; + x12r = a[24]; + x12i = a[25]; + x13r = a[26]; + x13i = a[27]; + x14r = a[28]; + x14i = a[29]; + x15r = a[30]; + x15i = a[31]; + a[2] = x15r; + a[3] = x15i; + a[4] = x7r; + a[5] = x7i; + a[6] = x11r; + a[7] = x11i; + a[8] = x3r; + a[9] = x3i; + a[10] = x13r; + a[11] = x13i; + a[12] = x5r; + a[13] = x5i; + a[14] = x9r; + a[15] = x9i; + a[16] = x1r; + a[17] = x1i; + a[18] = x14r; + a[19] = x14i; + a[20] = x6r; + a[21] = x6i; + a[22] = x10r; + a[23] = x10i; + a[24] = x2r; + a[25] = x2i; + a[26] = x12r; + a[27] = x12i; + a[28] = x4r; + a[29] = x4i; + a[30] = x8r; + a[31] = x8i; +} + + +void bitrv208(double *a) { + double x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i; + + x1r = a[2]; + x1i = a[3]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x6r = a[12]; + x6i = a[13]; + a[2] = x4r; + a[3] = x4i; + a[6] = x6r; + a[7] = x6i; + a[8] = x1r; + a[9] = x1i; + a[12] = x3r; + a[13] = x3i; +} + + +void bitrv208neg(double *a) { + double x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, x5r, x5i, x6r, x6i, x7r, x7i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x6r = a[12]; + x6i = a[13]; + x7r = a[14]; + x7i = a[15]; + a[2] = x7r; + a[3] = x7i; + a[4] = x3r; + a[5] = x3i; + a[6] = x5r; + a[7] = x5i; + a[8] = x1r; + a[9] = x1i; + a[10] = x6r; + a[11] = x6i; + a[12] = x2r; + a[13] = x2i; + a[14] = x4r; + a[15] = x4i; +} + + +void cftf1st(int n, double *a, double *w) { + int j, j0, j1, j2, j3, k, m, mh; + double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, y1i, y2r, y2i, + y3r, y3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = a[1] + a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = a[1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = 
x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j2] = x1r - x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r + x3i; + a[j3 + 1] = x1i - x3r; + wn4r = w[1]; + csc1 = w[2]; + csc3 = w[3]; + wd1r = 1; + wd1i = 0; + wd3r = 1; + wd3i = 0; + k = 0; + for (j = 2; j < mh - 2; j += 4) { + k += 4; + wk1r = csc1 * (wd1r + w[k]); + wk1i = csc1 * (wd1i + w[k + 1]); + wk3r = csc3 * (wd3r + w[k + 2]); + wk3i = csc3 * (wd3i + w[k + 3]); + wd1r = w[k]; + wd1i = w[k + 1]; + wd3r = w[k + 2]; + wd3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = a[j + 1] + a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = a[j + 1] - a[j2 + 1]; + y0r = a[j + 2] + a[j2 + 2]; + y0i = a[j + 3] + a[j2 + 3]; + y1r = a[j + 2] - a[j2 + 2]; + y1i = a[j + 3] - a[j2 + 3]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 + 2] + a[j3 + 2]; + y2i = a[j1 + 3] + a[j3 + 3]; + y3r = a[j1 + 2] - a[j3 + 2]; + y3i = a[j1 + 3] - a[j3 + 3]; + a[j] = x0r + x2r; + a[j + 1] = x0i + x2i; + a[j + 2] = y0r + y2r; + a[j + 3] = y0i + y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j1 + 2] = y0r - y2r; + a[j1 + 3] = y0i - y2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = y1r - y3i; + x0i = y1i + y3r; + a[j2 + 2] = wd1r * x0r - wd1i * x0i; + a[j2 + 3] = wd1r * x0i + wd1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + x0r = y1r + y3i; + x0i = y1i - y3r; + a[j3 + 2] = wd3r * x0r + wd3i * x0i; + a[j3 + 3] = wd3r * x0i - wd3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + y0r = a[j0 - 2] + a[j2 - 2]; + y0i = a[j0 - 1] + a[j2 - 1]; + y1r = a[j0 - 2] - a[j2 - 2]; + y1i = a[j0 - 1] - a[j2 - 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 - 2] + a[j3 - 2]; + y2i = a[j1 - 1] + a[j3 - 1]; + y3r = a[j1 - 2] - a[j3 - 2]; + y3i = a[j1 - 1] - a[j3 - 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j0 - 2] = y0r + y2r; + a[j0 - 1] = y0i + y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j1 - 2] = y0r - y2r; + a[j1 - 1] = y0i - y2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = y1r - y3i; + x0i = y1i + y3r; + a[j2 - 2] = wd1i * x0r - wd1r * x0i; + a[j2 - 1] = wd1i * x0i + wd1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + x0r = y1r + y3i; + x0i = y1i - y3r; + a[j3 - 2] = wd3i * x0r + wd3r * x0i; + a[j3 - 1] = wd3i * x0i - wd3r * x0r; + } + wk1r = csc1 * (wd1r + wn4r); + wk1i = csc1 * (wd1i + wn4r); + wk3r = csc3 * (wd3r - wn4r); + wk3i = csc3 * (wd3i - wn4r); + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0 - 2] + a[j2 - 2]; + x0i = a[j0 - 1] + a[j2 - 1]; + x1r = a[j0 - 2] - a[j2 - 2]; + x1i = a[j0 - 1] - a[j2 - 1]; + x2r = a[j1 - 2] + a[j3 - 2]; + x2i = a[j1 - 1] + a[j3 - 1]; + x3r = a[j1 - 2] - a[j3 - 2]; + x3i = a[j1 - 1] - a[j3 - 1]; + a[j0 - 2] = x0r + x2r; + a[j0 - 1] = x0i + x2i; + a[j1 - 2] = x0r - x2r; + a[j1 - 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2 - 2] = wk1r * x0r - wk1i * x0i; + a[j2 - 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 - 2] = wk3r * x0r + wk3i * x0i; + a[j3 - 
1] = wk3r * x0i - wk3i * x0r; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); + x0r = a[j0 + 2] + a[j2 + 2]; + x0i = a[j0 + 3] + a[j2 + 3]; + x1r = a[j0 + 2] - a[j2 + 2]; + x1i = a[j0 + 3] - a[j2 + 3]; + x2r = a[j1 + 2] + a[j3 + 2]; + x2i = a[j1 + 3] + a[j3 + 3]; + x3r = a[j1 + 2] - a[j3 + 2]; + x3i = a[j1 + 3] - a[j3 + 3]; + a[j0 + 2] = x0r + x2r; + a[j0 + 3] = x0i + x2i; + a[j1 + 2] = x0r - x2r; + a[j1 + 3] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2 + 2] = wk1i * x0r - wk1r * x0i; + a[j2 + 3] = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 + 2] = wk3i * x0r + wk3r * x0i; + a[j3 + 3] = wk3i * x0i - wk3r * x0r; +} + + +void cftb1st(int n, double *a, double *w) { + int j, j0, j1, j2, j3, k, m, mh; + double wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, y1i, y2r, y2i, + y3r, y3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = -a[1] - a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = -a[1] + a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = x0i - x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j2] = x1r + x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r - x3i; + a[j3 + 1] = x1i - x3r; + wn4r = w[1]; + csc1 = w[2]; + csc3 = w[3]; + wd1r = 1; + wd1i = 0; + wd3r = 1; + wd3i = 0; + k = 0; + for (j = 2; j < mh - 2; j += 4) { + k += 4; + wk1r = csc1 * (wd1r + w[k]); + wk1i = csc1 * (wd1i + w[k + 1]); + wk3r = csc3 * (wd3r + w[k + 2]); + wk3i = csc3 * (wd3i + w[k + 3]); + wd1r = w[k]; + wd1i = w[k + 1]; + wd3r = w[k + 2]; + wd3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = -a[j + 1] - a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = -a[j + 1] + a[j2 + 1]; + y0r = a[j + 2] + a[j2 + 2]; + y0i = -a[j + 3] - a[j2 + 3]; + y1r = a[j + 2] - a[j2 + 2]; + y1i = -a[j + 3] + a[j2 + 3]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 + 2] + a[j3 + 2]; + y2i = a[j1 + 3] + a[j3 + 3]; + y3r = a[j1 + 2] - a[j3 + 2]; + y3i = a[j1 + 3] - a[j3 + 3]; + a[j] = x0r + x2r; + a[j + 1] = x0i - x2i; + a[j + 2] = y0r + y2r; + a[j + 3] = y0i - y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j1 + 2] = y0r - y2r; + a[j1 + 3] = y0i + y2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = y1r + y3i; + x0i = y1i + y3r; + a[j2 + 2] = wd1r * x0r - wd1i * x0i; + a[j2 + 3] = wd1r * x0i + wd1i * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + x0r = y1r - y3i; + x0i = y1i - y3r; + a[j3 + 2] = wd3r * x0r + wd3i * x0i; + a[j3 + 3] = wd3r * x0i - wd3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = -a[j0 + 1] - a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = -a[j0 + 1] + a[j2 + 1]; + y0r = a[j0 - 2] + a[j2 - 2]; + y0i = -a[j0 - 1] 
- a[j2 - 1]; + y1r = a[j0 - 2] - a[j2 - 2]; + y1i = -a[j0 - 1] + a[j2 - 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 - 2] + a[j3 - 2]; + y2i = a[j1 - 1] + a[j3 - 1]; + y3r = a[j1 - 2] - a[j3 - 2]; + y3i = a[j1 - 1] - a[j3 - 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i - x2i; + a[j0 - 2] = y0r + y2r; + a[j0 - 1] = y0i - y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j1 - 2] = y0r - y2r; + a[j1 - 1] = y0i + y2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = y1r + y3i; + x0i = y1i + y3r; + a[j2 - 2] = wd1i * x0r - wd1r * x0i; + a[j2 - 1] = wd1i * x0i + wd1r * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + x0r = y1r - y3i; + x0i = y1i - y3r; + a[j3 - 2] = wd3i * x0r + wd3r * x0i; + a[j3 - 1] = wd3i * x0i - wd3r * x0r; + } + wk1r = csc1 * (wd1r + wn4r); + wk1i = csc1 * (wd1i + wn4r); + wk3r = csc3 * (wd3r - wn4r); + wk3i = csc3 * (wd3i - wn4r); + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0 - 2] + a[j2 - 2]; + x0i = -a[j0 - 1] - a[j2 - 1]; + x1r = a[j0 - 2] - a[j2 - 2]; + x1i = -a[j0 - 1] + a[j2 - 1]; + x2r = a[j1 - 2] + a[j3 - 2]; + x2i = a[j1 - 1] + a[j3 - 1]; + x3r = a[j1 - 2] - a[j3 - 2]; + x3i = a[j1 - 1] - a[j3 - 1]; + a[j0 - 2] = x0r + x2r; + a[j0 - 1] = x0i - x2i; + a[j1 - 2] = x0r - x2r; + a[j1 - 1] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2 - 2] = wk1r * x0r - wk1i * x0i; + a[j2 - 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3 - 2] = wk3r * x0r + wk3i * x0i; + a[j3 - 1] = wk3r * x0i - wk3i * x0r; + x0r = a[j0] + a[j2]; + x0i = -a[j0 + 1] - a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = -a[j0 + 1] + a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i - x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); + x0r = a[j0 + 2] + a[j2 + 2]; + x0i = -a[j0 + 3] - a[j2 + 3]; + x1r = a[j0 + 2] - a[j2 + 2]; + x1i = -a[j0 + 3] + a[j2 + 3]; + x2r = a[j1 + 2] + a[j3 + 2]; + x2i = a[j1 + 3] + a[j3 + 3]; + x3r = a[j1 + 2] - a[j3 + 2]; + x3i = a[j1 + 3] - a[j3 + 3]; + a[j0 + 2] = x0r + x2r; + a[j0 + 3] = x0i - x2i; + a[j1 + 2] = x0r - x2r; + a[j1 + 3] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2 + 2] = wk1i * x0r - wk1r * x0i; + a[j2 + 3] = wk1i * x0i + wk1r * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3 + 2] = wk3i * x0r + wk3r * x0i; + a[j3 + 3] = wk3i * x0i - wk3r * x0r; +} + + +#ifdef USE_CDFT_THREADS +struct cdft_arg_st { + int n0; + int n; + double *a; + int nw; + double *w; +}; +typedef struct cdft_arg_st cdft_arg_t; + + +void cftrec4_th(int n, double *a, int nw, double *w) { + void *cftrec1_th(void *p); + void *cftrec2_th(void *p); + int i, idiv4, m, nthread; + cdft_thread_t th[4]; + cdft_arg_t ag[4]; + + nthread = 2; + idiv4 = 0; + m = n >> 1; + if (n > CDFT_4THREADS_BEGIN_N) { + nthread = 4; + idiv4 = 1; + m >>= 1; + } + for (i = 0; i < nthread; i++) { + ag[i].n0 = n; + ag[i].n = m; + ag[i].a = &a[i * m]; + ag[i].nw = nw; + ag[i].w = w; + if (i != idiv4) { + cdft_thread_create(&th[i], cftrec1_th, &ag[i]); + } else { + cdft_thread_create(&th[i], cftrec2_th, &ag[i]); + } + } + for (i = 
0; i < nthread; i++) { + cdft_thread_wait(th[i]); + } +} + + +void *cftrec1_th(void *p) { + int cfttree(int n, int j, int k, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftmdl1(int n, double *a, double *w); + int isplt, j, k, m, n, n0, nw; + double *a, *w; + + n0 = ((cdft_arg_t *)p)->n0; + n = ((cdft_arg_t *)p)->n; + a = ((cdft_arg_t *)p)->a; + nw = ((cdft_arg_t *)p)->nw; + w = ((cdft_arg_t *)p)->w; + m = n0; + while (m > 512) { + m >>= 2; + cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]); + } + cftleaf(m, 1, &a[n - m], nw, w); + k = 0; + for (j = n - m; j > 0; j -= m) { + k++; + isplt = cfttree(m, j, k, a, nw, w); + cftleaf(m, isplt, &a[j - m], nw, w); + } + return (void *)0; +} + + +void *cftrec2_th(void *p) { + int cfttree(int n, int j, int k, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftmdl2(int n, double *a, double *w); + int isplt, j, k, m, n, n0, nw; + double *a, *w; + + n0 = ((cdft_arg_t *)p)->n0; + n = ((cdft_arg_t *)p)->n; + a = ((cdft_arg_t *)p)->a; + nw = ((cdft_arg_t *)p)->nw; + w = ((cdft_arg_t *)p)->w; + k = 1; + m = n0; + while (m > 512) { + m >>= 2; + k <<= 2; + cftmdl2(m, &a[n - m], &w[nw - m]); + } + cftleaf(m, 0, &a[n - m], nw, w); + k >>= 1; + for (j = n - m; j > 0; j -= m) { + k++; + isplt = cfttree(m, j, k, a, nw, w); + cftleaf(m, isplt, &a[j - m], nw, w); + } + return (void *)0; +} +#endif /* USE_CDFT_THREADS */ + + +void cftrec4(int n, double *a, int nw, double *w) { + int cfttree(int n, int j, int k, double *a, int nw, double *w); + void cftleaf(int n, int isplt, double *a, int nw, double *w); + void cftmdl1(int n, double *a, double *w); + int isplt, j, k, m; + + m = n; + while (m > 512) { + m >>= 2; + cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]); + } + cftleaf(m, 1, &a[n - m], nw, w); + k = 0; + for (j = n - m; j > 0; j -= m) { + k++; + isplt = cfttree(m, j, k, a, nw, w); + cftleaf(m, isplt, &a[j - m], nw, w); + } +} + + +int cfttree(int n, int j, int k, double *a, int nw, double *w) { + void cftmdl1(int n, double *a, double *w); + void cftmdl2(int n, double *a, double *w); + int i, isplt, m; + + if ((k & 3) != 0) { + isplt = k & 1; + if (isplt != 0) { + cftmdl1(n, &a[j - n], &w[nw - (n >> 1)]); + } else { + cftmdl2(n, &a[j - n], &w[nw - n]); + } + } else { + m = n; + for (i = k; (i & 3) == 0; i >>= 2) { + m <<= 2; + } + isplt = i & 1; + if (isplt != 0) { + while (m > 128) { + cftmdl1(m, &a[j - m], &w[nw - (m >> 1)]); + m >>= 2; + } + } else { + while (m > 128) { + cftmdl2(m, &a[j - m], &w[nw - m]); + m >>= 2; + } + } + } + return isplt; +} + + +void cftleaf(int n, int isplt, double *a, int nw, double *w) { + void cftmdl1(int n, double *a, double *w); + void cftmdl2(int n, double *a, double *w); + void cftf161(double *a, double *w); + void cftf162(double *a, double *w); + void cftf081(double *a, double *w); + void cftf082(double *a, double *w); + + if (n == 512) { + cftmdl1(128, a, &w[nw - 64]); + cftf161(a, &w[nw - 8]); + cftf162(&a[32], &w[nw - 32]); + cftf161(&a[64], &w[nw - 8]); + cftf161(&a[96], &w[nw - 8]); + cftmdl2(128, &a[128], &w[nw - 128]); + cftf161(&a[128], &w[nw - 8]); + cftf162(&a[160], &w[nw - 32]); + cftf161(&a[192], &w[nw - 8]); + cftf162(&a[224], &w[nw - 32]); + cftmdl1(128, &a[256], &w[nw - 64]); + cftf161(&a[256], &w[nw - 8]); + cftf162(&a[288], &w[nw - 32]); + cftf161(&a[320], &w[nw - 8]); + cftf161(&a[352], &w[nw - 8]); + if (isplt != 0) { + cftmdl1(128, &a[384], &w[nw - 64]); + cftf161(&a[480], &w[nw - 8]); + } else { + cftmdl2(128, 
&a[384], &w[nw - 128]); + cftf162(&a[480], &w[nw - 32]); + } + cftf161(&a[384], &w[nw - 8]); + cftf162(&a[416], &w[nw - 32]); + cftf161(&a[448], &w[nw - 8]); + } else { + cftmdl1(64, a, &w[nw - 32]); + cftf081(a, &w[nw - 8]); + cftf082(&a[16], &w[nw - 8]); + cftf081(&a[32], &w[nw - 8]); + cftf081(&a[48], &w[nw - 8]); + cftmdl2(64, &a[64], &w[nw - 64]); + cftf081(&a[64], &w[nw - 8]); + cftf082(&a[80], &w[nw - 8]); + cftf081(&a[96], &w[nw - 8]); + cftf082(&a[112], &w[nw - 8]); + cftmdl1(64, &a[128], &w[nw - 32]); + cftf081(&a[128], &w[nw - 8]); + cftf082(&a[144], &w[nw - 8]); + cftf081(&a[160], &w[nw - 8]); + cftf081(&a[176], &w[nw - 8]); + if (isplt != 0) { + cftmdl1(64, &a[192], &w[nw - 32]); + cftf081(&a[240], &w[nw - 8]); + } else { + cftmdl2(64, &a[192], &w[nw - 64]); + cftf082(&a[240], &w[nw - 8]); + } + cftf081(&a[192], &w[nw - 8]); + cftf082(&a[208], &w[nw - 8]); + cftf081(&a[224], &w[nw - 8]); + } +} + + +void cftmdl1(int n, double *a, double *w) { + int j, j0, j1, j2, j3, k, m, mh; + double wn4r, wk1r, wk1i, wk3r, wk3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = a[1] + a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = a[1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j2] = x1r - x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r + x3i; + a[j3 + 1] = x1i - x3r; + wn4r = w[1]; + k = 0; + for (j = 2; j < mh; j += 2) { + k += 4; + wk1r = w[k]; + wk1i = w[k + 1]; + wk3r = w[k + 2]; + wk3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = a[j + 1] + a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = a[j + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j] = x0r + x2r; + a[j + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + } + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); +} + + +void cftmdl2(int n, double *a, double *w) { + int j, j0, j1, j2, j3, k, kr, m, mh; + double wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, 
wd1i, wd3r, wd3i; + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i; + + mh = n >> 3; + m = 2 * mh; + wn4r = w[1]; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] - a[j2 + 1]; + x0i = a[1] + a[j2]; + x1r = a[0] + a[j2 + 1]; + x1i = a[1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wn4r * (x2r - x2i); + y0i = wn4r * (x2i + x2r); + a[0] = x0r + y0r; + a[1] = x0i + y0i; + a[j1] = x0r - y0r; + a[j1 + 1] = x0i - y0i; + y0r = wn4r * (x3r - x3i); + y0i = wn4r * (x3i + x3r); + a[j2] = x1r - y0i; + a[j2 + 1] = x1i + y0r; + a[j3] = x1r + y0i; + a[j3 + 1] = x1i - y0r; + k = 0; + kr = 2 * m; + for (j = 2; j < mh; j += 2) { + k += 4; + wk1r = w[k]; + wk1i = w[k + 1]; + wk3r = w[k + 2]; + wk3i = w[k + 3]; + kr -= 4; + wd1i = w[kr]; + wd1r = w[kr + 1]; + wd3i = w[kr + 2]; + wd3r = w[kr + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] - a[j2 + 1]; + x0i = a[j + 1] + a[j2]; + x1r = a[j] + a[j2 + 1]; + x1i = a[j + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wk1r * x0r - wk1i * x0i; + y0i = wk1r * x0i + wk1i * x0r; + y2r = wd1r * x2r - wd1i * x2i; + y2i = wd1r * x2i + wd1i * x2r; + a[j] = y0r + y2r; + a[j + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wk3r * x1r + wk3i * x1i; + y0i = wk3r * x1i - wk3i * x1r; + y2r = wd3r * x3r + wd3i * x3i; + y2i = wd3r * x3i - wd3i * x3r; + a[j2] = y0r + y2r; + a[j2 + 1] = y0i + y2i; + a[j3] = y0r - y2r; + a[j3 + 1] = y0i - y2i; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] - a[j2 + 1]; + x0i = a[j0 + 1] + a[j2]; + x1r = a[j0] + a[j2 + 1]; + x1i = a[j0 + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wd1i * x0r - wd1r * x0i; + y0i = wd1i * x0i + wd1r * x0r; + y2r = wk1i * x2r - wk1r * x2i; + y2i = wk1i * x2i + wk1r * x2r; + a[j0] = y0r + y2r; + a[j0 + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wd3i * x1r + wd3r * x1i; + y0i = wd3i * x1i - wd3r * x1r; + y2r = wk3i * x3r + wk3r * x3i; + y2i = wk3i * x3i - wk3r * x3r; + a[j2] = y0r + y2r; + a[j2 + 1] = y0i + y2i; + a[j3] = y0r - y2r; + a[j3 + 1] = y0i - y2i; + } + wk1r = w[m]; + wk1i = w[m + 1]; + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] - a[j2 + 1]; + x0i = a[j0 + 1] + a[j2]; + x1r = a[j0] + a[j2 + 1]; + x1i = a[j0 + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wk1r * x0r - wk1i * x0i; + y0i = wk1r * x0i + wk1i * x0r; + y2r = wk1i * x2r - wk1r * x2i; + y2i = wk1i * x2i + wk1r * x2r; + a[j0] = y0r + y2r; + a[j0 + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wk1i * x1r - wk1r * x1i; + y0i = wk1i * x1i + wk1r * x1r; + y2r = wk1r * x3r - wk1i * x3i; + y2i = wk1r * x3i + wk1i * x3r; + a[j2] = y0r - y2r; + a[j2 + 1] = y0i - y2i; + a[j3] = y0r + y2r; + a[j3 + 1] = y0i + y2i; +} + + +void cftfx41(int n, double *a, int nw, double *w) { + void cftf161(double *a, double *w); + void cftf162(double *a, double *w); + void cftf081(double *a, double *w); + void cftf082(double *a, double *w); + + if (n == 128) { + cftf161(a, &w[nw - 8]); + cftf162(&a[32], &w[nw - 32]); + cftf161(&a[64], &w[nw - 8]); + cftf161(&a[96], &w[nw - 8]); + } else { + cftf081(a, &w[nw - 8]); + cftf082(&a[16], &w[nw - 8]); + cftf081(&a[32], &w[nw - 8]); + cftf081(&a[48], &w[nw - 
8]); + } +} + + +void cftf161(double *a, double *w) { + double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, + y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, + y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, y12r, y12i, y13r, y13i, + y14r, y14i, y15r, y15i; + + wn4r = w[1]; + wk1r = w[2]; + wk1i = w[3]; + x0r = a[0] + a[16]; + x0i = a[1] + a[17]; + x1r = a[0] - a[16]; + x1i = a[1] - a[17]; + x2r = a[8] + a[24]; + x2i = a[9] + a[25]; + x3r = a[8] - a[24]; + x3i = a[9] - a[25]; + y0r = x0r + x2r; + y0i = x0i + x2i; + y4r = x0r - x2r; + y4i = x0i - x2i; + y8r = x1r - x3i; + y8i = x1i + x3r; + y12r = x1r + x3i; + y12i = x1i - x3r; + x0r = a[2] + a[18]; + x0i = a[3] + a[19]; + x1r = a[2] - a[18]; + x1i = a[3] - a[19]; + x2r = a[10] + a[26]; + x2i = a[11] + a[27]; + x3r = a[10] - a[26]; + x3i = a[11] - a[27]; + y1r = x0r + x2r; + y1i = x0i + x2i; + y5r = x0r - x2r; + y5i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y9r = wk1r * x0r - wk1i * x0i; + y9i = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + y13r = wk1i * x0r - wk1r * x0i; + y13i = wk1i * x0i + wk1r * x0r; + x0r = a[4] + a[20]; + x0i = a[5] + a[21]; + x1r = a[4] - a[20]; + x1i = a[5] - a[21]; + x2r = a[12] + a[28]; + x2i = a[13] + a[29]; + x3r = a[12] - a[28]; + x3i = a[13] - a[29]; + y2r = x0r + x2r; + y2i = x0i + x2i; + y6r = x0r - x2r; + y6i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y10r = wn4r * (x0r - x0i); + y10i = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + y14r = wn4r * (x0r + x0i); + y14i = wn4r * (x0i - x0r); + x0r = a[6] + a[22]; + x0i = a[7] + a[23]; + x1r = a[6] - a[22]; + x1i = a[7] - a[23]; + x2r = a[14] + a[30]; + x2i = a[15] + a[31]; + x3r = a[14] - a[30]; + x3i = a[15] - a[31]; + y3r = x0r + x2r; + y3i = x0i + x2i; + y7r = x0r - x2r; + y7i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y11r = wk1i * x0r - wk1r * x0i; + y11i = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + y15r = wk1r * x0r - wk1i * x0i; + y15i = wk1r * x0i + wk1i * x0r; + x0r = y12r - y14r; + x0i = y12i - y14i; + x1r = y12r + y14r; + x1i = y12i + y14i; + x2r = y13r - y15r; + x2i = y13i - y15i; + x3r = y13r + y15r; + x3i = y13i + y15i; + a[24] = x0r + x2r; + a[25] = x0i + x2i; + a[26] = x0r - x2r; + a[27] = x0i - x2i; + a[28] = x1r - x3i; + a[29] = x1i + x3r; + a[30] = x1r + x3i; + a[31] = x1i - x3r; + x0r = y8r + y10r; + x0i = y8i + y10i; + x1r = y8r - y10r; + x1i = y8i - y10i; + x2r = y9r + y11r; + x2i = y9i + y11i; + x3r = y9r - y11r; + x3i = y9i - y11i; + a[16] = x0r + x2r; + a[17] = x0i + x2i; + a[18] = x0r - x2r; + a[19] = x0i - x2i; + a[20] = x1r - x3i; + a[21] = x1i + x3r; + a[22] = x1r + x3i; + a[23] = x1i - x3r; + x0r = y5r - y7i; + x0i = y5i + y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + x0r = y5r + y7i; + x0i = y5i - y7r; + x3r = wn4r * (x0r - x0i); + x3i = wn4r * (x0i + x0r); + x0r = y4r - y6i; + x0i = y4i + y6r; + x1r = y4r + y6i; + x1i = y4i - y6r; + a[8] = x0r + x2r; + a[9] = x0i + x2i; + a[10] = x0r - x2r; + a[11] = x0i - x2i; + a[12] = x1r - x3i; + a[13] = x1i + x3r; + a[14] = x1r + x3i; + a[15] = x1i - x3r; + x0r = y0r + y2r; + x0i = y0i + y2i; + x1r = y0r - y2r; + x1i = y0i - y2i; + x2r = y1r + y3r; + x2i = y1i + y3i; + x3r = y1r - y3r; + x3i = y1i - y3i; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x0r - x2r; + a[3] = x0i - x2i; + a[4] = x1r - x3i; + a[5] = x1i + x3r; + a[6] = x1r + x3i; + a[7] = x1i - x3r; +} + + +void cftf162(double *a, double *w) { + double wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, 
wk3i, x0r, x0i, x1r, x1i, x2r, + x2i, y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, + y6i, y7r, y7i, y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, y12r, y12i, + y13r, y13i, y14r, y14i, y15r, y15i; + + wn4r = w[1]; + wk1r = w[4]; + wk1i = w[5]; + wk3r = w[6]; + wk3i = -w[7]; + wk2r = w[8]; + wk2i = w[9]; + x1r = a[0] - a[17]; + x1i = a[1] + a[16]; + x0r = a[8] - a[25]; + x0i = a[9] + a[24]; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + y0r = x1r + x2r; + y0i = x1i + x2i; + y4r = x1r - x2r; + y4i = x1i - x2i; + x1r = a[0] + a[17]; + x1i = a[1] - a[16]; + x0r = a[8] + a[25]; + x0i = a[9] - a[24]; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + y8r = x1r - x2i; + y8i = x1i + x2r; + y12r = x1r + x2i; + y12i = x1i - x2r; + x0r = a[2] - a[19]; + x0i = a[3] + a[18]; + x1r = wk1r * x0r - wk1i * x0i; + x1i = wk1r * x0i + wk1i * x0r; + x0r = a[10] - a[27]; + x0i = a[11] + a[26]; + x2r = wk3i * x0r - wk3r * x0i; + x2i = wk3i * x0i + wk3r * x0r; + y1r = x1r + x2r; + y1i = x1i + x2i; + y5r = x1r - x2r; + y5i = x1i - x2i; + x0r = a[2] + a[19]; + x0i = a[3] - a[18]; + x1r = wk3r * x0r - wk3i * x0i; + x1i = wk3r * x0i + wk3i * x0r; + x0r = a[10] + a[27]; + x0i = a[11] - a[26]; + x2r = wk1r * x0r + wk1i * x0i; + x2i = wk1r * x0i - wk1i * x0r; + y9r = x1r - x2r; + y9i = x1i - x2i; + y13r = x1r + x2r; + y13i = x1i + x2i; + x0r = a[4] - a[21]; + x0i = a[5] + a[20]; + x1r = wk2r * x0r - wk2i * x0i; + x1i = wk2r * x0i + wk2i * x0r; + x0r = a[12] - a[29]; + x0i = a[13] + a[28]; + x2r = wk2i * x0r - wk2r * x0i; + x2i = wk2i * x0i + wk2r * x0r; + y2r = x1r + x2r; + y2i = x1i + x2i; + y6r = x1r - x2r; + y6i = x1i - x2i; + x0r = a[4] + a[21]; + x0i = a[5] - a[20]; + x1r = wk2i * x0r - wk2r * x0i; + x1i = wk2i * x0i + wk2r * x0r; + x0r = a[12] + a[29]; + x0i = a[13] - a[28]; + x2r = wk2r * x0r - wk2i * x0i; + x2i = wk2r * x0i + wk2i * x0r; + y10r = x1r - x2r; + y10i = x1i - x2i; + y14r = x1r + x2r; + y14i = x1i + x2i; + x0r = a[6] - a[23]; + x0i = a[7] + a[22]; + x1r = wk3r * x0r - wk3i * x0i; + x1i = wk3r * x0i + wk3i * x0r; + x0r = a[14] - a[31]; + x0i = a[15] + a[30]; + x2r = wk1i * x0r - wk1r * x0i; + x2i = wk1i * x0i + wk1r * x0r; + y3r = x1r + x2r; + y3i = x1i + x2i; + y7r = x1r - x2r; + y7i = x1i - x2i; + x0r = a[6] + a[23]; + x0i = a[7] - a[22]; + x1r = wk1i * x0r + wk1r * x0i; + x1i = wk1i * x0i - wk1r * x0r; + x0r = a[14] + a[31]; + x0i = a[15] - a[30]; + x2r = wk3i * x0r - wk3r * x0i; + x2i = wk3i * x0i + wk3r * x0r; + y11r = x1r + x2r; + y11i = x1i + x2i; + y15r = x1r - x2r; + y15i = x1i - x2i; + x1r = y0r + y2r; + x1i = y0i + y2i; + x2r = y1r + y3r; + x2i = y1i + y3i; + a[0] = x1r + x2r; + a[1] = x1i + x2i; + a[2] = x1r - x2r; + a[3] = x1i - x2i; + x1r = y0r - y2r; + x1i = y0i - y2i; + x2r = y1r - y3r; + x2i = y1i - y3i; + a[4] = x1r - x2i; + a[5] = x1i + x2r; + a[6] = x1r + x2i; + a[7] = x1i - x2r; + x1r = y4r - y6i; + x1i = y4i + y6r; + x0r = y5r - y7i; + x0i = y5i + y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[8] = x1r + x2r; + a[9] = x1i + x2i; + a[10] = x1r - x2r; + a[11] = x1i - x2i; + x1r = y4r + y6i; + x1i = y4i - y6r; + x0r = y5r + y7i; + x0i = y5i - y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[12] = x1r - x2i; + a[13] = x1i + x2r; + a[14] = x1r + x2i; + a[15] = x1i - x2r; + x1r = y8r + y10r; + x1i = y8i + y10i; + x2r = y9r - y11r; + x2i = y9i - y11i; + a[16] = x1r + x2r; + a[17] = x1i + x2i; + a[18] = x1r - x2r; + a[19] = x1i - x2i; + x1r = y8r - y10r; + x1i = y8i - y10i; + x2r = y9r + y11r; + x2i = y9i + y11i; + 
a[20] = x1r - x2i; + a[21] = x1i + x2r; + a[22] = x1r + x2i; + a[23] = x1i - x2r; + x1r = y12r - y14i; + x1i = y12i + y14r; + x0r = y13r + y15i; + x0i = y13i - y15r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[24] = x1r + x2r; + a[25] = x1i + x2i; + a[26] = x1r - x2r; + a[27] = x1i - x2i; + x1r = y12r + y14i; + x1i = y12i - y14r; + x0r = y13r - y15i; + x0i = y13i + y15r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[28] = x1r - x2i; + a[29] = x1i + x2r; + a[30] = x1r + x2i; + a[31] = x1i - x2r; +} + + +void cftf081(double *a, double *w) { + double wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, y1i, + y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i; + + wn4r = w[1]; + x0r = a[0] + a[8]; + x0i = a[1] + a[9]; + x1r = a[0] - a[8]; + x1i = a[1] - a[9]; + x2r = a[4] + a[12]; + x2i = a[5] + a[13]; + x3r = a[4] - a[12]; + x3i = a[5] - a[13]; + y0r = x0r + x2r; + y0i = x0i + x2i; + y2r = x0r - x2r; + y2i = x0i - x2i; + y1r = x1r - x3i; + y1i = x1i + x3r; + y3r = x1r + x3i; + y3i = x1i - x3r; + x0r = a[2] + a[10]; + x0i = a[3] + a[11]; + x1r = a[2] - a[10]; + x1i = a[3] - a[11]; + x2r = a[6] + a[14]; + x2i = a[7] + a[15]; + x3r = a[6] - a[14]; + x3i = a[7] - a[15]; + y4r = x0r + x2r; + y4i = x0i + x2i; + y6r = x0r - x2r; + y6i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + x2r = x1r + x3i; + x2i = x1i - x3r; + y5r = wn4r * (x0r - x0i); + y5i = wn4r * (x0r + x0i); + y7r = wn4r * (x2r - x2i); + y7i = wn4r * (x2r + x2i); + a[8] = y1r + y5r; + a[9] = y1i + y5i; + a[10] = y1r - y5r; + a[11] = y1i - y5i; + a[12] = y3r - y7i; + a[13] = y3i + y7r; + a[14] = y3r + y7i; + a[15] = y3i - y7r; + a[0] = y0r + y4r; + a[1] = y0i + y4i; + a[2] = y0r - y4r; + a[3] = y0i - y4i; + a[4] = y2r - y6i; + a[5] = y2i + y6r; + a[6] = y2r + y6i; + a[7] = y2i - y6r; +} + + +void cftf082(double *a, double *w) { + double wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, y0r, y0i, y1r, y1i, y2r, y2i, + y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i; + + wn4r = w[1]; + wk1r = w[2]; + wk1i = w[3]; + y0r = a[0] - a[9]; + y0i = a[1] + a[8]; + y1r = a[0] + a[9]; + y1i = a[1] - a[8]; + x0r = a[4] - a[13]; + x0i = a[5] + a[12]; + y2r = wn4r * (x0r - x0i); + y2i = wn4r * (x0i + x0r); + x0r = a[4] + a[13]; + x0i = a[5] - a[12]; + y3r = wn4r * (x0r - x0i); + y3i = wn4r * (x0i + x0r); + x0r = a[2] - a[11]; + x0i = a[3] + a[10]; + y4r = wk1r * x0r - wk1i * x0i; + y4i = wk1r * x0i + wk1i * x0r; + x0r = a[2] + a[11]; + x0i = a[3] - a[10]; + y5r = wk1i * x0r - wk1r * x0i; + y5i = wk1i * x0i + wk1r * x0r; + x0r = a[6] - a[15]; + x0i = a[7] + a[14]; + y6r = wk1i * x0r - wk1r * x0i; + y6i = wk1i * x0i + wk1r * x0r; + x0r = a[6] + a[15]; + x0i = a[7] - a[14]; + y7r = wk1r * x0r - wk1i * x0i; + y7i = wk1r * x0i + wk1i * x0r; + x0r = y0r + y2r; + x0i = y0i + y2i; + x1r = y4r + y6r; + x1i = y4i + y6i; + a[0] = x0r + x1r; + a[1] = x0i + x1i; + a[2] = x0r - x1r; + a[3] = x0i - x1i; + x0r = y0r - y2r; + x0i = y0i - y2i; + x1r = y4r - y6r; + x1i = y4i - y6i; + a[4] = x0r - x1i; + a[5] = x0i + x1r; + a[6] = x0r + x1i; + a[7] = x0i - x1r; + x0r = y1r - y3i; + x0i = y1i + y3r; + x1r = y5r - y7r; + x1i = y5i - y7i; + a[8] = x0r + x1r; + a[9] = x0i + x1i; + a[10] = x0r - x1r; + a[11] = x0i - x1i; + x0r = y1r + y3i; + x0i = y1i - y3r; + x1r = y5r + y7r; + x1i = y5i + y7i; + a[12] = x0r - x1i; + a[13] = x0i + x1r; + a[14] = x0r + x1i; + a[15] = x0i - x1r; +} + + +void cftf040(double *a) { + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + x0r = a[0] + a[4]; + x0i = a[1] + a[5]; + x1r = a[0] - a[4]; + x1i = a[1] 
- a[5]; + x2r = a[2] + a[6]; + x2i = a[3] + a[7]; + x3r = a[2] - a[6]; + x3i = a[3] - a[7]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x1r - x3i; + a[3] = x1i + x3r; + a[4] = x0r - x2r; + a[5] = x0i - x2i; + a[6] = x1r + x3i; + a[7] = x1i - x3r; +} + + +void cftb040(double *a) { + double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + x0r = a[0] + a[4]; + x0i = a[1] + a[5]; + x1r = a[0] - a[4]; + x1i = a[1] - a[5]; + x2r = a[2] + a[6]; + x2i = a[3] + a[7]; + x3r = a[2] - a[6]; + x3i = a[3] - a[7]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x1r + x3i; + a[3] = x1i - x3r; + a[4] = x0r - x2r; + a[5] = x0i - x2i; + a[6] = x1r - x3i; + a[7] = x1i + x3r; +} + + +void cftx020(double *a) { + double x0r, x0i; + + x0r = a[0] - a[2]; + x0i = a[1] - a[3]; + a[0] += a[2]; + a[1] += a[3]; + a[2] = x0r; + a[3] = x0i; +} + + +void rftfsub(int n, double *a, int nc, double *c) { + int j, k, kk, ks, m; + double wkr, wki, xr, xi, yr, yi; + + m = n >> 1; + ks = 2 * nc / m; + kk = 0; + for (j = 2; j < m; j += 2) { + k = n - j; + kk += ks; + wkr = 0.5 - c[nc - kk]; + wki = c[kk]; + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + a[j] -= yr; + a[j + 1] -= yi; + a[k] += yr; + a[k + 1] -= yi; + } +} + + +void rftbsub(int n, double *a, int nc, double *c) { + int j, k, kk, ks, m; + double wkr, wki, xr, xi, yr, yi; + + m = n >> 1; + ks = 2 * nc / m; + kk = 0; + for (j = 2; j < m; j += 2) { + k = n - j; + kk += ks; + wkr = 0.5 - c[nc - kk]; + wki = c[kk]; + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + a[j] -= yr; + a[j + 1] -= yi; + a[k] += yr; + a[k + 1] -= yi; + } +} + + +void dctsub(int n, double *a, int nc, double *c) { + int j, k, kk, ks, m; + double wkr, wki, xr; + + m = n >> 1; + ks = nc / n; + kk = 0; + for (j = 1; j < m; j++) { + k = n - j; + kk += ks; + wkr = c[kk] - c[nc - kk]; + wki = c[kk] + c[nc - kk]; + xr = wki * a[j] - wkr * a[k]; + a[j] = wkr * a[j] + wki * a[k]; + a[k] = xr; + } + a[m] *= c[0]; +} + + +void dstsub(int n, double *a, int nc, double *c) { + int j, k, kk, ks, m; + double wkr, wki, xr; + + m = n >> 1; + ks = nc / n; + kk = 0; + for (j = 1; j < m; j++) { + k = n - j; + kk += ks; + wkr = c[kk] - c[nc - kk]; + wki = c[kk] + c[nc - kk]; + xr = wki * a[k] - wkr * a[j]; + a[k] = wkr * a[k] + wki * a[j]; + a[j] = xr; + } + a[m] *= c[0]; +} diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.cc b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.cc new file mode 100644 index 00000000..6922808a --- /dev/null +++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.cc @@ -0,0 +1,143 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Stack trace related stuff is from kaldi. 
+ * Refer to + * https://github.com/kaldi-asr/kaldi/blob/master/src/base/kaldi-error.cc + */ + +#include "kaldi-native-fbank/csrc/log.h" + +#ifdef KNF_HAVE_EXECINFO_H +#include <execinfo.h> // To get stack trace in error messages. +#ifdef KNF_HAVE_CXXABI_H +#include <cxxabi.h> // For name demangling. +// Useful to decode the stack trace, but only used if we have execinfo.h +#endif // KNF_HAVE_CXXABI_H +#endif // KNF_HAVE_EXECINFO_H + +#include <stdlib.h> + +#include <ctime> +#include <iomanip> +#include <string> + +namespace knf { + +std::string GetDateTimeStr() { + std::ostringstream os; + std::time_t t = std::time(nullptr); + std::tm tm = *std::localtime(&t); + os << std::put_time(&tm, "%F %T"); // yyyy-mm-dd hh:mm:ss + return os.str(); +} + +static bool LocateSymbolRange(const std::string &trace_name, std::size_t *begin, + std::size_t *end) { + // Find the first '_' with leading ' ' or '('. + *begin = std::string::npos; + for (std::size_t i = 1; i < trace_name.size(); ++i) { + if (trace_name[i] != '_') { + continue; + } + if (trace_name[i - 1] == ' ' || trace_name[i - 1] == '(') { + *begin = i; + break; + } + } + if (*begin == std::string::npos) { + return false; + } + *end = trace_name.find_first_of(" +", *begin); + return *end != std::string::npos; +} + +#ifdef KNF_HAVE_EXECINFO_H +static std::string Demangle(const std::string &trace_name) { +#ifndef KNF_HAVE_CXXABI_H + return trace_name; +#else // KNF_HAVE_CXXABI_H + // Try to demangle the symbol. We are trying to support the following formats + // produced by different platforms: + // + // Linux: + // ./kaldi-error-test(_ZN5kaldi13UnitTestErrorEv+0xb) [0x804965d] + // + // Mac: + // 0 server 0x000000010f67614d _ZNK5kaldi13MessageLogger10LogMessageEv + 813 + // + // We want to extract the name e.g., '_ZN5kaldi13UnitTestErrorEv' and + // demangle it into a readable name like kaldi::UnitTestError. + std::size_t begin, end; + if (!LocateSymbolRange(trace_name, &begin, &end)) { + return trace_name; + } + std::string symbol = trace_name.substr(begin, end - begin); + int status; + char *demangled_name = abi::__cxa_demangle(symbol.c_str(), 0, 0, &status); + if (status == 0 && demangled_name != nullptr) { + symbol = demangled_name; + free(demangled_name); + } + return trace_name.substr(0, begin) + symbol + + trace_name.substr(end, std::string::npos); +#endif // KNF_HAVE_CXXABI_H +} +#endif // KNF_HAVE_EXECINFO_H + +std::string GetStackTrace() { + std::string ans; +#ifdef KNF_HAVE_EXECINFO_H + constexpr const std::size_t kMaxTraceSize = 50; + constexpr const std::size_t kMaxTracePrint = 50; // Must be even. + // Buffer for the trace. + void *trace[kMaxTraceSize]; + // Get the trace. + std::size_t size = backtrace(trace, kMaxTraceSize); + // Get the trace symbols. + char **trace_symbol = backtrace_symbols(trace, size); + if (trace_symbol == nullptr) + return ans; + + // Compose a human-readable backtrace string. + ans += "[ Stack-Trace: ]\n"; + if (size <= kMaxTracePrint) { + for (std::size_t i = 0; i < size; ++i) { + ans += Demangle(trace_symbol[i]) + "\n"; + } + } else { // Print out first+last (e.g.) 5. + for (std::size_t i = 0; i < kMaxTracePrint / 2; ++i) { + ans += Demangle(trace_symbol[i]) + "\n"; + } + ans += ".\n.\n.\n"; + for (std::size_t i = size - kMaxTracePrint / 2; i < size; ++i) { + ans += Demangle(trace_symbol[i]) + "\n"; + } + if (size == kMaxTraceSize) + ans += ".\n.\n.\n"; // Stack was too long, probably a bug. + } + + // We must free the array of pointers allocated by backtrace_symbols(), + // but not the strings themselves. + free(trace_symbol); +#endif // KNF_HAVE_EXECINFO_H + return ans; +} + +} // namespace knf
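A side note on the stack-trace code above: it composes `backtrace()` and `backtrace_symbols()` from `<execinfo.h>` with `abi::__cxa_demangle()` from `<cxxabi.h>`. A minimal, self-contained sketch of just the demangling step (not part of the patch; it assumes a GCC/Clang toolchain that ships `<cxxabi.h>`):

```cpp
// Sketch: demangling one symbol the way Demangle() above does.
#include <cxxabi.h>

#include <cstdio>
#include <cstdlib>

int main() {
  // A mangled symbol like the ones LocateSymbolRange() extracts.
  const char *mangled = "_ZN5kaldi13UnitTestErrorEv";
  int status = 0;
  // __cxa_demangle() malloc()s the result; the caller must free() it.
  char *readable = abi::__cxa_demangle(mangled, nullptr, nullptr, &status);
  if (status == 0 && readable != nullptr) {
    std::printf("%s -> %s\n", mangled, readable);  // kaldi::UnitTestError()
    std::free(readable);
  } else {
    std::printf("could not demangle (status=%d)\n", status);
  }
  return 0;
}
```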
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.h b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.h new file mode 100644 index 00000000..feb38db1 --- /dev/null +++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/log.h @@ -0,0 +1,347 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// The content in this file is copied/modified from +// https://github.com/k2-fsa/k2/blob/master/k2/csrc/log.h +#ifndef KALDI_NATIVE_FBANK_CSRC_LOG_H_ +#define KALDI_NATIVE_FBANK_CSRC_LOG_H_ + +#include <stdio.h> + +#include <mutex> // NOLINT +#include <sstream> +#include <string> + +namespace knf { + +#if defined(NDEBUG) +constexpr bool kDisableDebug = true; +#else +constexpr bool kDisableDebug = false; +#endif + +enum class LogLevel { + kTrace = 0, + kDebug = 1, + kInfo = 2, + kWarning = 3, + kError = 4, + kFatal = 5, // print message and abort the program +}; + +// They are used in KNF_LOG(xxx), so their names +// do not follow the google c++ code style +// +// You can use them in the following way: +// +// KNF_LOG(TRACE) << "some message"; +// KNF_LOG(DEBUG) << "some message"; +#ifndef _MSC_VER +constexpr LogLevel TRACE = LogLevel::kTrace; +constexpr LogLevel DEBUG = LogLevel::kDebug; +constexpr LogLevel INFO = LogLevel::kInfo; +constexpr LogLevel WARNING = LogLevel::kWarning; +constexpr LogLevel ERROR = LogLevel::kError; +constexpr LogLevel FATAL = LogLevel::kFatal; +#else +#define TRACE LogLevel::kTrace +#define DEBUG LogLevel::kDebug +#define INFO LogLevel::kInfo +#define WARNING LogLevel::kWarning +#define ERROR LogLevel::kError +#define FATAL LogLevel::kFatal +#endif + +std::string GetStackTrace(); + +/* Return the current log level. + + If the current log level is TRACE, then all logged messages are printed out. + + If the current log level is DEBUG, log messages with "TRACE" level are not + shown and all other levels are printed out. + + Similarly, if the current log level is INFO, log messages with "TRACE" and + "DEBUG" are not shown and all other levels are printed out. + + If it is FATAL, then only FATAL messages are shown. + */ +inline LogLevel GetCurrentLogLevel() { + static LogLevel log_level = INFO; + static std::once_flag init_flag; + std::call_once(init_flag, []() { + const char *env_log_level = std::getenv("KNF_LOG_LEVEL"); + if (env_log_level == nullptr) return; + + std::string s = env_log_level; + if (s == "TRACE") + log_level = TRACE; + else if (s == "DEBUG") + log_level = DEBUG; + else if (s == "INFO") + log_level = INFO; + else if (s == "WARNING") + log_level = WARNING; + else if (s == "ERROR") + log_level = ERROR; + else if (s == "FATAL") + log_level = FATAL; + else + fprintf(stderr, + "Unknown KNF_LOG_LEVEL: %s" + "\nSupported values are: " + "TRACE, DEBUG, INFO, WARNING, ERROR, FATAL", + s.c_str()); + }); + return log_level; +} + +inline bool EnableAbort() { + static std::once_flag init_flag; + static bool enable_abort = false; + std::call_once(init_flag, []() { + enable_abort = (std::getenv("KNF_ABORT") != nullptr); + }); + return enable_abort; +}
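A quick usage sketch of the behavior implemented by `GetCurrentLogLevel()` and `EnableAbort()` above: each reads its environment variable exactly once per process via `std::call_once`. The `KNF_LOG`/`KNF_CHECK_*` macros it feeds are defined further down in this header; this demo is not part of the patch.

```cpp
// Usage sketch. Run as:  KNF_LOG_LEVEL=DEBUG ./demo
#include "kaldi-native-fbank/csrc/log.h"

int main() {
  KNF_LOG(TRACE) << "suppressed when the level is DEBUG or higher";
  KNF_LOG(INFO) << "num mel bins: " << 80;

  // On failure this logs at FATAL and throws std::runtime_error
  // (or abort()s if KNF_ABORT is set, or on Android).
  KNF_CHECK_GT(80, 3);
  return 0;
}
```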
+ +class Logger { + public: + Logger(const char *filename, const char *func_name, uint32_t line_num, + LogLevel level) + : filename_(filename), + func_name_(func_name), + line_num_(line_num), + level_(level) { + cur_level_ = GetCurrentLogLevel(); + switch (level) { + case TRACE: + if (cur_level_ <= TRACE) fprintf(stderr, "[T] "); + break; + case DEBUG: + if (cur_level_ <= DEBUG) fprintf(stderr, "[D] "); + break; + case INFO: + if (cur_level_ <= INFO) fprintf(stderr, "[I] "); + break; + case WARNING: + if (cur_level_ <= WARNING) fprintf(stderr, "[W] "); + break; + case ERROR: + if (cur_level_ <= ERROR) fprintf(stderr, "[E] "); + break; + case FATAL: + if (cur_level_ <= FATAL) fprintf(stderr, "[F] "); + break; + } + + if (cur_level_ <= level_) { + fprintf(stderr, "%s:%u:%s ", filename, line_num, func_name); + } + } + + ~Logger() noexcept(false) { + static constexpr const char *kErrMsg = R"( + Some bad things happened. Please read the above error messages and stack + trace. If you are using Python, the following command may be helpful: + + gdb --args python /path/to/your/code.py + + (You can use `gdb` to debug the code. Please consider compiling + a debug version of KNF.). + + If you are unable to fix it, please open an issue at: + + https://github.com/csukuangfj/kaldi-native-fbank/issues/new + )"; + fprintf(stderr, "\n"); + if (level_ == FATAL) { + std::string stack_trace = GetStackTrace(); + if (!stack_trace.empty()) { + fprintf(stderr, "\n\n%s\n", stack_trace.c_str()); + } + + fflush(nullptr); + +#ifndef __ANDROID_API__ + if (EnableAbort()) { + // NOTE: abort() will terminate the program immediately without + // printing the Python stack backtrace. + abort(); + } + + throw std::runtime_error(kErrMsg); +#else + abort(); +#endif + } + } + + const Logger &operator<<(bool b) const { + if (cur_level_ <= level_) { + fprintf(stderr, b ? 
"true" : "false"); + } + return *this; + } + + const Logger &operator<<(int8_t i) const { + if (cur_level_ <= level_) fprintf(stderr, "%d", i); + return *this; + } + + const Logger &operator<<(const char *s) const { + if (cur_level_ <= level_) fprintf(stderr, "%s", s); + return *this; + } + + const Logger &operator<<(int32_t i) const { + if (cur_level_ <= level_) fprintf(stderr, "%d", i); + return *this; + } + + const Logger &operator<<(uint32_t i) const { + if (cur_level_ <= level_) fprintf(stderr, "%u", i); + return *this; + } + + const Logger &operator<<(uint64_t i) const { + if (cur_level_ <= level_) + fprintf(stderr, "%llu", (long long unsigned int)i); // NOLINT + return *this; + } + + const Logger &operator<<(int64_t i) const { + if (cur_level_ <= level_) + fprintf(stderr, "%lli", (long long int)i); // NOLINT + return *this; + } + + const Logger &operator<<(float f) const { + if (cur_level_ <= level_) fprintf(stderr, "%f", f); + return *this; + } + + const Logger &operator<<(double d) const { + if (cur_level_ <= level_) fprintf(stderr, "%f", d); + return *this; + } + + template + const Logger &operator<<(const T &t) const { + // require T overloads operator<< + std::ostringstream os; + os << t; + return *this << os.str().c_str(); + } + + // specialization to fix compile error: `stringstream << nullptr` is ambiguous + const Logger &operator<<(const std::nullptr_t &null) const { + if (cur_level_ <= level_) *this << "(null)"; + return *this; + } + + private: + const char *filename_; + const char *func_name_; + uint32_t line_num_; + LogLevel level_; + LogLevel cur_level_; +}; + +class Voidifier { + public: + void operator&(const Logger &)const {} +}; + +} // namespace knf + +#if defined(__clang__) || defined(__GNUC__) || defined(__GNUG__) || \ + defined(__PRETTY_FUNCTION__) +// for clang and GCC +#define KNF_FUNC __PRETTY_FUNCTION__ +#else +// for other compilers +#define KNF_FUNC __func__ +#endif + +#define KNF_STATIC_ASSERT(x) static_assert(x, "") + +#define KNF_CHECK(x) \ + (x) ? (void)0 \ + : ::knf::Voidifier() & \ + ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \ + << "Check failed: " << #x << " " + +// WARNING: x and y may be evaluated multiple times, but this happens only +// when the check fails. Since the program aborts if it fails, we don't think +// the extra evaluation of x and y matters. +// +// CAUTION: we recommend the following use case: +// +// auto x = Foo(); +// auto y = Bar(); +// KNF_CHECK_EQ(x, y) << "Some message"; +// +// And please avoid +// +// KNF_CHECK_EQ(Foo(), Bar()); +// +// if `Foo()` or `Bar()` causes some side effects, e.g., changing some +// local static variables or global variables. +#define _KNF_CHECK_OP(x, y, op) \ + ((x)op(y)) ? (void)0 \ + : ::knf::Voidifier() & \ + ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::FATAL) \ + << "Check failed: " << #x << " " << #op << " " << #y \ + << " (" << (x) << " vs. 
" << (y) << ") " + +#define KNF_CHECK_EQ(x, y) _KNF_CHECK_OP(x, y, ==) +#define KNF_CHECK_NE(x, y) _KNF_CHECK_OP(x, y, !=) +#define KNF_CHECK_LT(x, y) _KNF_CHECK_OP(x, y, <) +#define KNF_CHECK_LE(x, y) _KNF_CHECK_OP(x, y, <=) +#define KNF_CHECK_GT(x, y) _KNF_CHECK_OP(x, y, >) +#define KNF_CHECK_GE(x, y) _KNF_CHECK_OP(x, y, >=) + +#define KNF_LOG(x) ::knf::Logger(__FILE__, KNF_FUNC, __LINE__, ::knf::x) + +// ------------------------------------------------------------ +// For debug check +// ------------------------------------------------------------ +// If you define the macro "-D NDEBUG" while compiling kaldi-native-fbank, +// the following macros are in fact empty and does nothing. + +#define KNF_DCHECK(x) ::knf::kDisableDebug ? (void)0 : KNF_CHECK(x) + +#define KNF_DCHECK_EQ(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_EQ(x, y) + +#define KNF_DCHECK_NE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_NE(x, y) + +#define KNF_DCHECK_LT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LT(x, y) + +#define KNF_DCHECK_LE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_LE(x, y) + +#define KNF_DCHECK_GT(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GT(x, y) + +#define KNF_DCHECK_GE(x, y) ::knf::kDisableDebug ? (void)0 : KNF_CHECK_GE(x, y) + +#define KNF_DLOG(x) \ + ::knf::kDisableDebug ? (void)0 : ::knf::Voidifier() & KNF_LOG(x) + +#endif // KALDI_NATIVE_FBANK_CSRC_LOG_H_ diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.cc b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.cc new file mode 100644 index 00000000..dade576b --- /dev/null +++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.cc @@ -0,0 +1,256 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// This file is copied/modified from kaldi/src/feat/mel-computations.cc + +#include "kaldi-native-fbank/csrc/mel-computations.h" + +#include +#include + +#include "kaldi-native-fbank/csrc/feature-window.h" + +namespace knf { + +std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts) { + os << opts.ToString(); + return os; +} + +float MelBanks::VtlnWarpFreq( + float vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN. + float vtln_high_cutoff, + float low_freq, // upper+lower frequency cutoffs in mel computation + float high_freq, float vtln_warp_factor, float freq) { + /// This computes a VTLN warping function that is not the same as HTK's one, + /// but has similar inputs (this function has the advantage of never producing + /// empty bins). + + /// This function computes a warp function F(freq), defined between low_freq + /// and high_freq inclusive, with the following properties: + /// F(low_freq) == low_freq + /// F(high_freq) == high_freq + /// The function is continuous and piecewise linear with two inflection + /// points. 
+ /// The lower inflection point (measured in terms of the unwarped + /// frequency) is at frequency l, determined as described below. + /// The higher inflection point is at a frequency h, determined as + /// described below. + /// If l <= f <= h, then F(f) = f/vtln_warp_factor. + /// If the higher inflection point (measured in terms of the unwarped + /// frequency) is at h, then max(h, F(h)) == vtln_high_cutoff. + /// Since (by the last point) F(h) == h/vtln_warp_factor, then + /// max(h, h/vtln_warp_factor) == vtln_high_cutoff, so + /// h = vtln_high_cutoff / max(1, 1/vtln_warp_factor). + /// = vtln_high_cutoff * min(1, vtln_warp_factor). + /// If the lower inflection point (measured in terms of the unwarped + /// frequency) is at l, then min(l, F(l)) == vtln_low_cutoff + /// This implies that l = vtln_low_cutoff / min(1, 1/vtln_warp_factor) + /// = vtln_low_cutoff * max(1, vtln_warp_factor) + /// See also the formula sketch after this function. + + if (freq < low_freq || freq > high_freq) + return freq; // in case this gets called + // for out-of-range frequencies, just return the freq. + + KNF_CHECK_GT(vtln_low_cutoff, low_freq); + KNF_CHECK_LT(vtln_high_cutoff, high_freq); + + float one = 1.0f; + float l = vtln_low_cutoff * std::max(one, vtln_warp_factor); + float h = vtln_high_cutoff * std::min(one, vtln_warp_factor); + float scale = 1.0f / vtln_warp_factor; + float Fl = scale * l; // F(l); + float Fh = scale * h; // F(h); + KNF_CHECK(l > low_freq && h < high_freq); + // slope of left part of the 3-piece linear function + float scale_left = (Fl - low_freq) / (l - low_freq); + // [slope of center part is just "scale"] + + // slope of right part of the 3-piece linear function + float scale_right = (high_freq - Fh) / (high_freq - h); + + if (freq < l) { + return low_freq + scale_left * (freq - low_freq); + } else if (freq < h) { + return scale * freq; + } else { // freq >= h + return high_freq + scale_right * (freq - high_freq); + } +} + +float MelBanks::VtlnWarpMelFreq( + float vtln_low_cutoff, // upper+lower frequency cutoffs for VTLN. + float vtln_high_cutoff, + float low_freq, // upper+lower frequency cutoffs in mel computation + float high_freq, float vtln_warp_factor, float mel_freq) { + return MelScale(VtlnWarpFreq(vtln_low_cutoff, vtln_high_cutoff, low_freq, + high_freq, vtln_warp_factor, + InverseMelScale(mel_freq))); +}
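Restating the derivation in the comment above as one formula: with warp factor α = `vtln_warp_factor`, cutoffs f_low/f_high (`low_freq`/`high_freq`), l = v_low·max(1, α) and h = v_high·min(1, α) as computed in the code, the three-piece linear warp returned by `VtlnWarpFreq` for f_low ≤ f ≤ f_high is

```latex
F(f) =
\begin{cases}
  f_{\text{low}} + \dfrac{l/\alpha - f_{\text{low}}}{l - f_{\text{low}}}\,(f - f_{\text{low}}), & f_{\text{low}} \le f < l,\\[2ex]
  f/\alpha, & l \le f < h,\\[1ex]
  f_{\text{high}} + \dfrac{f_{\text{high}} - h/\alpha}{f_{\text{high}} - h}\,(f - f_{\text{high}}), & h \le f \le f_{\text{high}},
\end{cases}
```

and F(f) = f outside that range (the early return). `VtlnWarpMelFreq` simply conjugates this warp with the mel scale: Mel(F(Mel⁻¹(m))).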
nyquist " + << nyquist; + } + + float fft_bin_width = sample_freq / window_length_padded; + // fft-bin width [think of it as Nyquist-freq / half-window-length] + + float mel_low_freq = MelScale(low_freq); + float mel_high_freq = MelScale(high_freq); + + debug_ = opts.debug_mel; + + // divide by num_bins+1 in next line because of end-effects where the bins + // spread out to the sides. + float mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1); + + float vtln_low = opts.vtln_low, vtln_high = opts.vtln_high; + if (vtln_high < 0.0f) { + vtln_high += nyquist; + } + + if (vtln_warp_factor != 1.0f && + (vtln_low < 0.0f || vtln_low <= low_freq || vtln_low >= high_freq || + vtln_high <= 0.0f || vtln_high >= high_freq || vtln_high <= vtln_low)) { + KNF_LOG(FATAL) << "Bad values in options: vtln-low " << vtln_low + << " and vtln-high " << vtln_high << ", versus " + << "low-freq " << low_freq << " and high-freq " << high_freq; + } + + bins_.resize(num_bins); + center_freqs_.resize(num_bins); + + for (int32_t bin = 0; bin < num_bins; ++bin) { + float left_mel = mel_low_freq + bin * mel_freq_delta, + center_mel = mel_low_freq + (bin + 1) * mel_freq_delta, + right_mel = mel_low_freq + (bin + 2) * mel_freq_delta; + + if (vtln_warp_factor != 1.0f) { + left_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, + vtln_warp_factor, left_mel); + center_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, + vtln_warp_factor, center_mel); + right_mel = VtlnWarpMelFreq(vtln_low, vtln_high, low_freq, high_freq, + vtln_warp_factor, right_mel); + } + center_freqs_[bin] = InverseMelScale(center_mel); + + // this_bin will be a vector of coefficients that is only + // nonzero where this mel bin is active. + std::vector this_bin(num_fft_bins); + + int32_t first_index = -1, last_index = -1; + for (int32_t i = 0; i < num_fft_bins; ++i) { + float freq = (fft_bin_width * i); // Center frequency of this fft + // bin. + float mel = MelScale(freq); + if (mel > left_mel && mel < right_mel) { + float weight; + if (mel <= center_mel) + weight = (mel - left_mel) / (center_mel - left_mel); + else + weight = (right_mel - mel) / (right_mel - center_mel); + this_bin[i] = weight; + if (first_index == -1) first_index = i; + last_index = i; + } + } + KNF_CHECK(first_index != -1 && last_index >= first_index && + "You may have set num_mel_bins too large."); + + bins_[bin].first = first_index; + int32_t size = last_index + 1 - first_index; + bins_[bin].second.insert(bins_[bin].second.end(), + this_bin.begin() + first_index, + this_bin.begin() + first_index + size); + + // Replicate a bug in HTK, for testing purposes. + if (opts.htk_mode && bin == 0 && mel_low_freq != 0.0f) { + bins_[bin].second[0] = 0.0; + } + } // for (int32_t bin = 0; bin < num_bins; ++bin) { + + if (debug_) { + std::ostringstream os; + for (size_t i = 0; i < bins_.size(); i++) { + os << "bin " << i << ", offset = " << bins_[i].first << ", vec = "; + for (auto k : bins_[i].second) os << k << ", "; + os << "\n"; + } + KNF_LOG(INFO) << os.str(); + } +} + +// "power_spectrum" contains fft energies. 
+ +// "power_spectrum" contains fft energies. +void MelBanks::Compute(const float *power_spectrum, + float *mel_energies_out) const { + int32_t num_bins = bins_.size(); + + for (int32_t i = 0; i < num_bins; i++) { + int32_t offset = bins_[i].first; + const auto &v = bins_[i].second; + float energy = 0; + for (int32_t k = 0; k != v.size(); ++k) { + energy += v[k] * power_spectrum[k + offset]; + } + + // HTK-like flooring - for testing purposes (we prefer dither) + if (htk_mode_ && energy < 1.0) { + energy = 1.0; + } + + mel_energies_out[i] = energy; + + // The following assert was added due to a problem with OpenBlas that + // we had at one point (it was a bug in that library). Just to detect + // it early. + KNF_CHECK_EQ(energy, energy); // check that energy is not nan + } + + if (debug_) { + fprintf(stderr, "MEL BANKS:\n"); + for (int32_t i = 0; i < num_bins; i++) + fprintf(stderr, " %f", mel_energies_out[i]); + fprintf(stderr, "\n"); + } +} + +} // namespace knf
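A hedged usage sketch of the class just defined: one frame's power spectrum in, one vector of (non-log) mel energies out. The option values here are illustrative only, and it assumes `FrameExtractionOptions` from feature-window.h carries usable defaults; the buffer sizes follow the constructor's `PaddedWindowSize() / 2` and `NumBins()`.

```cpp
// Usage sketch (illustrative values; not part of the patch).
#include <vector>

#include "kaldi-native-fbank/csrc/mel-computations.h"

int main() {
  knf::FrameExtractionOptions frame_opts;  // assumed defaults
  knf::MelBanksOptions mel_opts;
  mel_opts.num_bins = 23;

  // vtln_warp_factor == 1.0f disables VTLN warping entirely.
  knf::MelBanks mel_banks(mel_opts, frame_opts, 1.0f);

  // One frame's power spectrum: padded_window_size / 2 bins.
  std::vector<float> power(frame_opts.PaddedWindowSize() / 2, 1.0f);
  std::vector<float> mel_energies(mel_banks.NumBins());

  mel_banks.Compute(power.data(), mel_energies.data());
  return 0;
}
```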
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.h b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.h new file mode 100644 index 00000000..e743243a --- /dev/null +++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/mel-computations.h @@ -0,0 +1,115 @@ +/** + * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// This file is copied/modified from kaldi/src/feat/mel-computations.h +#ifndef KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_ +#define KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_ + +#include <string> +#include <vector> + +#include "kaldi-native-fbank/csrc/feature-window.h" + +namespace knf { + +struct MelBanksOptions { + int32_t num_bins = 25; // e.g. 25; number of triangular bins + float low_freq = 20; // e.g. 20; lower frequency cutoff + + // an upper frequency cutoff; 0 -> no cutoff, negative + // -> added to the Nyquist frequency to get the cutoff. + float high_freq = 0; + + float vtln_low = 100; // vtln lower cutoff of warping function. + + // vtln upper cutoff of warping function: if negative, added + // to the Nyquist frequency to get the cutoff. + float vtln_high = -500; + + bool debug_mel = false; + // htk_mode is a "hidden" config; it does not show up on the command line. + // Enables more exact compatibility with HTK, for testing purposes. Affects + // mel-energy flooring and reproduces a bug in HTK. + bool htk_mode = false; + + std::string ToString() const { + std::ostringstream os; + os << "num_bins: " << num_bins << "\n"; + os << "low_freq: " << low_freq << "\n"; + os << "high_freq: " << high_freq << "\n"; + os << "vtln_low: " << vtln_low << "\n"; + os << "vtln_high: " << vtln_high << "\n"; + os << "debug_mel: " << debug_mel << "\n"; + os << "htk_mode: " << htk_mode << "\n"; + return os.str(); + } +}; + +std::ostream &operator<<(std::ostream &os, const MelBanksOptions &opts); + +class MelBanks { + public: + static inline float InverseMelScale(float mel_freq) { + return 700.0f * (expf(mel_freq / 1127.0f) - 1.0f); + } + + static inline float MelScale(float freq) { + return 1127.0f * logf(1.0f + freq / 700.0f); + } + + static float VtlnWarpFreq( + float vtln_low_cutoff, + float vtln_high_cutoff, // discontinuities in warp func + float low_freq, + float high_freq, // upper+lower frequency cutoffs in + // the mel computation + float vtln_warp_factor, float freq); + + static float VtlnWarpMelFreq(float vtln_low_cutoff, float vtln_high_cutoff, + float low_freq, float high_freq, + float vtln_warp_factor, float mel_freq); + + // TODO(fangjun): Remove vtln_warp_factor + MelBanks(const MelBanksOptions &opts, + const FrameExtractionOptions &frame_opts, float vtln_warp_factor); + + /// Compute Mel energies (note: not log energies). + /// At input, "fft_energies" contains the FFT energies (not log). + /// + /// @param fft_energies 1-D array of size num_fft_bins/2+1 + /// @param mel_energies_out 1-D array of size num_mel_bins + void Compute(const float *fft_energies, float *mel_energies_out) const; + + int32_t NumBins() const { return bins_.size(); } + + private: + // center frequencies of bins, numbered from 0 ... num_bins-1. + // Needed by GetCenterFreqs(). + std::vector<float> center_freqs_; + + // the "bins_" vector is a vector, one for each bin, of a pair: + // (the first nonzero fft-bin), (the vector of weights). + std::vector<std::pair<int32_t, std::vector<float>>> bins_; + + // TODO(fangjun): Remove debug_ and htk_mode_ + bool debug_; + bool htk_mode_; +}; + +} // namespace knf + +#endif // KALDI_NATIVE_FBANK_CSRC_MEL_COMPUTATIONS_H_
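For reference, `MelScale` and `InverseMelScale` above are the usual natural-log mel-scale pair used throughout these files:

```latex
\mathrm{MelScale}(f) = 1127\,\ln\!\left(1 + \frac{f}{700}\right), \qquad
\mathrm{InverseMelScale}(m) = 700\,\bigl(e^{m/1127} - 1\bigr).
```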
+ */
+
+#include "kaldi-native-fbank/csrc/rfft.h"
+
+#include <cmath>
+#include <vector>
+
+#include "kaldi-native-fbank/csrc/log.h"
+
+// see fftsg.c
+#ifdef __cplusplus
+extern "C" void rdft(int n, int isgn, double *a, int *ip, double *w);
+#else
+void rdft(int n, int isgn, double *a, int *ip, double *w);
+#endif
+
+namespace knf {
+class Rfft::RfftImpl {
+ public:
+  explicit RfftImpl(int32_t n) : n_(n), ip_(2 + std::sqrt(n / 2)), w_(n / 2) {
+    KNF_CHECK_EQ(n & (n - 1), 0);
+  }
+
+  void Compute(float *in_out) {
+    std::vector<double> d(in_out, in_out + n_);
+
+    Compute(d.data());
+
+    std::copy(d.begin(), d.end(), in_out);
+  }
+
+  void Compute(double *in_out) {
+    // 1 means forward fft
+    rdft(n_, 1, in_out, ip_.data(), w_.data());
+  }
+
+ private:
+  int32_t n_;
+  std::vector<int> ip_;
+  std::vector<double> w_;
+};
+
+Rfft::Rfft(int32_t n) : impl_(std::make_unique<RfftImpl>(n)) {}
+
+Rfft::~Rfft() = default;
+
+void Rfft::Compute(float *in_out) { impl_->Compute(in_out); }
+void Rfft::Compute(double *in_out) { impl_->Compute(in_out); }
+
+}  // namespace knf
diff --git a/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.h b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.h
new file mode 100644
index 00000000..c8cb9f8c
--- /dev/null
+++ b/audio/paddleaudio/third_party/kaldi-native-fbank/csrc/rfft.h
@@ -0,0 +1,56 @@
+/**
+ * Copyright (c) 2022 Xiaomi Corporation (authors: Fangjun Kuang)
+ *
+ * See LICENSE for clarification regarding multiple authors
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef KALDI_NATIVE_FBANK_CSRC_RFFT_H_
+#define KALDI_NATIVE_FBANK_CSRC_RFFT_H_
+
+#include <memory>
+
+namespace knf {
+
+// n-point Real discrete Fourier transform
+// where n is a power of 2. n >= 2
+//
+// R[k] = sum_j=0^n-1 in[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+// I[k] = sum_j=0^n-1 in[j]*sin(2*pi*j*k/n), 0<k<n/2
+class Rfft {
+ public:
+  // @param n Number of FFT points. It must be a power of 2.
+  explicit Rfft(int32_t n);
+  ~Rfft();
+
+  // In-place FFT of a 1-D array "in_out" of size n.
+  void Compute(float *in_out);
+  void Compute(double *in_out);
+
+ private:
+  class RfftImpl;
+  std::unique_ptr<RfftImpl> impl_;
+};
+
+}  // namespace knf
+
+#endif  // KALDI_NATIVE_FBANK_CSRC_RFFT_H_
diff --git a/audio/paddleaudio/third_party/kaldi/CMakeLists.txt b/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
deleted file mode 100644
index e63fb578..00000000
--- a/audio/paddleaudio/third_party/kaldi/CMakeLists.txt
+++ /dev/null
@@ -1,111 +0,0 @@
-# checkout the thirdparty/kaldi/base/kaldi-types.h
-# compile kaldi without openfst
-add_definitions("-DCOMPILE_WITHOUT_OPENFST")
-
-if ((NOT EXISTS ${CMAKE_CURRENT_LIST_DIR}/base))
-    file(COPY ../../../../speechx/speechx/kaldi/base DESTINATION ${CMAKE_CURRENT_LIST_DIR})
-    file(COPY ../../../../speechx/speechx/kaldi/feat DESTINATION ${CMAKE_CURRENT_LIST_DIR})
-    file(COPY ../../../../speechx/speechx/kaldi/matrix DESTINATION ${CMAKE_CURRENT_LIST_DIR})
-    file(COPY ../../../../speechx/speechx/kaldi/util DESTINATION ${CMAKE_CURRENT_LIST_DIR})
-endif()
-
-# kaldi-base
-add_library(kaldi-base STATIC
-    base/io-funcs.cc
-    base/kaldi-error.cc
-    base/kaldi-math.cc
-    base/kaldi-utils.cc
-    base/timer.cc
-)
-target_include_directories(kaldi-base PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-
-# kaldi-matrix
-add_library(kaldi-matrix STATIC
-    matrix/compressed-matrix.cc
-    matrix/matrix-functions.cc
-    matrix/kaldi-matrix.cc
-    matrix/kaldi-vector.cc
-    matrix/optimization.cc
-    matrix/packed-matrix.cc
-    matrix/qr.cc
-    matrix/sparse-matrix.cc
-    matrix/sp-matrix.cc
-    matrix/srfft.cc
-    matrix/tp-matrix.cc
-)
-target_include_directories(kaldi-matrix PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-
-if (NOT MSVC)
-    target_link_libraries(kaldi-matrix PUBLIC kaldi-base libopenblas)
-else()
-    target_link_libraries(kaldi-matrix PUBLIC kaldi-base openblas)
-endif()
-
-# kaldi-util
-add_library(kaldi-util STATIC
-    util/kaldi-holder.cc
-    util/kaldi-io.cc
-    util/kaldi-semaphore.cc
-    util/kaldi-table.cc
-    util/kaldi-thread.cc
-    util/parse-options.cc
-    util/simple-io-funcs.cc
-    util/simple-options.cc
-    util/text-utils.cc
-)
-target_include_directories(kaldi-util PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-util PUBLIC kaldi-base kaldi-matrix)
-
-# kaldi-feat-common
-add_library(kaldi-feat-common STATIC
-    feat/cmvn.cc
-    feat/feature-functions.cc
-    feat/feature-window.cc
-    feat/mel-computations.cc
-    feat/pitch-functions.cc
-    feat/resample.cc
-    feat/signal.cc
-    feat/wave-reader.cc
-)
-target_include_directories(kaldi-feat-common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-feat-common PUBLIC kaldi-base kaldi-matrix kaldi-util)
-
-
-# kaldi-mfcc
-add_library(kaldi-mfcc STATIC
-    feat/feature-mfcc.cc
-)
-target_include_directories(kaldi-mfcc PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-mfcc PUBLIC kaldi-feat-common)
-
-
-# kaldi-fbank
-add_library(kaldi-fbank STATIC
-    feat/feature-fbank.cc
-)
-target_include_directories(kaldi-fbank PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
-target_link_libraries(kaldi-fbank PUBLIC kaldi-feat-common)
-
-
-set(KALDI_LIBRARIES
-    ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-base.a
-    ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-matrix.a
-    ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-util.a
-    ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-feat-common.a
-    ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-mfcc.a
-    ${CMAKE_CURRENT_BINARY_DIR}/libkaldi-fbank.a
-)
-
-add_library(libkaldi INTERFACE)
-add_dependencies(libkaldi kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank)
-target_include_directories(libkaldi INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
-
-if (APPLE) - target_link_libraries(libkaldi INTERFACE ${KALDI_LIBRARIES} libopenblas ${GFORTRAN_LIBRARIES_DIR}/libgfortran.a ${GFORTRAN_LIBRARIES_DIR}/libquadmath.a ${GFORTRAN_LIBRARIES_DIR}/libgcc_s.1.1.dylib) -elseif (MSVC) - target_link_libraries(libkaldi INTERFACE kaldi-base kaldi-matrix kaldi-util kaldi-feat-common kaldi-mfcc kaldi-fbank openblas) -else() - target_link_libraries(libkaldi INTERFACE -Wl,--start-group -Wl,--whole-archive ${KALDI_LIBRARIES} libopenblas.a gfortran -Wl,--no-whole-archive -Wl,--end-group) -endif() - -target_compile_definitions(libkaldi INTERFACE "-DCOMPILE_WITHOUT_OPENFST") diff --git a/audio/setup.py b/audio/setup.py index d7208a43..823e5dfa 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -40,19 +40,13 @@ COMMITID = 'none' base = [ "kaldiio", "librosa==0.8.1", - "scipy>=1.0.0", - "soundfile~=0.10", - "colorlog", - "pathos == 0.2.8", + "pathos", "pybind11", "parameterized", - "tqdm", - "scikit-learn" ] requirements = { - "install": - base, + "install": base, "develop": [ "sox", "soxbindings", @@ -60,6 +54,7 @@ requirements = { ], } + def check_call(cmd: str, shell=False, executable=None): try: sp.check_call( @@ -92,6 +87,7 @@ def check_output(cmd: Union[str, List[str], Tuple[str]], shell=False): file=sys.stderr) return out_bytes.strip().decode('utf8') + def _run_cmd(cmd): try: return subprocess.check_output( @@ -100,6 +96,7 @@ def _run_cmd(cmd): except Exception: return None + @contextlib.contextmanager def pushd(new_dir): old_dir = os.getcwd() @@ -109,22 +106,26 @@ def pushd(new_dir): os.chdir(old_dir) print(old_dir) + def read(*names, **kwargs): with io.open( os.path.join(os.path.dirname(__file__), *names), encoding=kwargs.get("encoding", "utf8")) as fp: return fp.read() + def _remove(files: str): for f in files: f.unlink() + ################################# Install ################################## def _post_install(install_lib_dir): pass + class DevelopCommand(develop): def run(self): develop.run(self) @@ -142,7 +143,7 @@ class TestCommand(test): # Run nose ensuring that argv simulates running nosetests directly import nose nose.run_exit(argv=['nosetests', '-w', 'tests']) - + def run_benchmark(self): for benchmark_item in glob.glob('tests/benchmark/*py'): os.system(f'pytest {benchmark_item}') @@ -188,6 +189,7 @@ def _make_version_file(version, sha): with open(version_path, "a") as f: f.write(f"__version__ = '{version}'\n") + def _rm_version(): file_ = ROOT_DIR / "paddleaudio" / "__init__.py" with open(file_, "r") as f: @@ -235,8 +237,8 @@ def main(): if platform.system() != 'Windows' and platform.system() != 'Linux': lib_package_data = {'paddleaudio': ['lib/libgcc_s.1.1.dylib']} - if platform.system() == 'Linux': - lib_package_data = {'paddleaudio': ['lib/lib*']} + #if platform.system() == 'Linux': + # lib_package_data = {'paddleaudio': ['lib/lib*']} setup_info = dict( # Metadata @@ -254,8 +256,7 @@ def main(): python_requires='>=3.7', install_requires=requirements["install"], extras_require={ - 'develop': - requirements["develop"], + 'develop': requirements["develop"], #'test': ["nose", "torchaudio==0.10.2", "pytest-benchmark", "librosa=0.8.1", "parameterized", "paddlepaddle"], }, cmdclass={ @@ -267,7 +268,7 @@ def main(): }, # Package info - packages=find_packages(include=('paddleaudio*')), + packages=find_packages(include=['paddleaudio*']), package_data=lib_package_data, ext_modules=setup_helpers.get_ext_modules(), zip_safe=True, @@ -284,11 +285,11 @@ def main(): 'Programming Language :: Python :: 3.8', 'Programming Language :: Python :: 3.9', 
'Programming Language :: Python :: 3.10', - ], - ) + ], ) setup(**setup_info) _rm_version() + if __name__ == '__main__': main() diff --git a/dataset/aidatatang_200zh/aidatatang_200zh.py b/dataset/aidatatang_200zh/aidatatang_200zh.py index 85f478c2..3b706c49 100644 --- a/dataset/aidatatang_200zh/aidatatang_200zh.py +++ b/dataset/aidatatang_200zh/aidatatang_200zh.py @@ -18,139 +18,7 @@ Manifest file is a json-format file with each line containing the meta data (i.e. audio filepath, transcript and audio duration) of each audio file in the data set. """ -import argparse -import codecs -import json -import os -from pathlib import Path - -import soundfile - -from utils.utility import download -from utils.utility import unpack - -DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') - -URL_ROOT = 'http://www.openslr.org/resources/62' -# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62' -DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz' -MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949' - -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--target_dir", - default=DATA_HOME + "/aidatatang_200zh", - type=str, - help="Directory to save the dataset. (default: %(default)s)") -parser.add_argument( - "--manifest_prefix", - default="manifest", - type=str, - help="Filepath prefix for output manifests. (default: %(default)s)") -args = parser.parse_args() - - -def create_manifest(data_dir, manifest_path_prefix): - print("Creating manifest %s ..." % manifest_path_prefix) - json_lines = [] - transcript_path = os.path.join(data_dir, 'transcript', - 'aidatatang_200_zh_transcript.txt') - transcript_dict = {} - for line in codecs.open(transcript_path, 'r', 'utf-8'): - line = line.strip() - if line == '': - continue - audio_id, text = line.split(' ', 1) - # remove withespace, charactor text - text = ''.join(text.split()) - transcript_dict[audio_id] = text - - data_types = ['train', 'dev', 'test'] - for dtype in data_types: - del json_lines[:] - total_sec = 0.0 - total_text = 0.0 - total_num = 0 - - audio_dir = os.path.join(data_dir, 'corpus/', dtype) - for subfolder, _, filelist in sorted(os.walk(audio_dir)): - for fname in filelist: - if not fname.endswith('.wav'): - continue - - audio_path = os.path.abspath(os.path.join(subfolder, fname)) - audio_id = os.path.basename(fname)[:-4] - utt2spk = Path(audio_path).parent.name - - audio_data, samplerate = soundfile.read(audio_path) - duration = float(len(audio_data) / samplerate) - text = transcript_dict[audio_id] - json_lines.append( - json.dumps( - { - 'utt': audio_id, - 'utt2spk': str(utt2spk), - 'feat': audio_path, - 'feat_shape': (duration, ), # second - 'text': text, - }, - ensure_ascii=False)) - - total_sec += duration - total_text += len(text) - total_num += 1 - - manifest_path = manifest_path_prefix + '.' 
+ dtype - with codecs.open(manifest_path, 'w', 'utf-8') as fout: - for line in json_lines: - fout.write(line + '\n') - - manifest_dir = os.path.dirname(manifest_path_prefix) - meta_path = os.path.join(manifest_dir, dtype) + '.meta' - with open(meta_path, 'w') as f: - print(f"{dtype}:", file=f) - print(f"{total_num} utts", file=f) - print(f"{total_sec / (60*60)} h", file=f) - print(f"{total_text} text", file=f) - print(f"{total_text / total_sec} text/sec", file=f) - print(f"{total_sec / total_num} sec/utt", file=f) - - -def prepare_dataset(url, md5sum, target_dir, manifest_path, subset): - """Download, unpack and create manifest file.""" - data_dir = os.path.join(target_dir, subset) - if not os.path.exists(data_dir): - filepath = download(url, md5sum, target_dir) - unpack(filepath, target_dir) - # unpack all audio tar files - audio_dir = os.path.join(data_dir, 'corpus') - for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)): - for sub in dirlist: - print(f"unpack dir {sub}...") - for folder, _, filelist in sorted( - os.walk(os.path.join(subfolder, sub))): - for ftar in filelist: - unpack(os.path.join(folder, ftar), folder, True) - else: - print("Skip downloading and unpacking. Data already exists in %s." % - target_dir) - - create_manifest(data_dir, manifest_path) - - -def main(): - if args.target_dir.startswith('~'): - args.target_dir = os.path.expanduser(args.target_dir) - - prepare_dataset( - url=DATA_URL, - md5sum=MD5_DATA, - target_dir=args.target_dir, - manifest_path=args.manifest_prefix, - subset='aidatatang_200zh') - - print("Data download and manifest prepare done!") - +from paddlespeech.dataset.aidatatang_200zh import aidatatang_200zh_main if __name__ == '__main__': - main() + aidatatang_200zh_main() diff --git a/dataset/aishell/README.md b/dataset/aishell/README.md deleted file mode 100644 index a7dd0cf3..00000000 --- a/dataset/aishell/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# [Aishell1](http://openslr.elda.org/33/) - -This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is a part of AISHELL-ASR0009, of which utterance contains 11 domains, including smart home, autonomous driving, and industrial production. The whole recording was put in quiet indoor environment, using 3 different devices at the same time: high fidelity microphone (44.1kHz, 16-bit,); Android-system mobile phone (16kHz, 16-bit), iOS-system mobile phone (16kHz, 16-bit). Audios in high fidelity were re-sampled to 16kHz to build AISHELL- ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. The manual transcription accuracy rate is above 95%, through professional speech annotation and strict quality inspection. The corpus is divided into training, development and testing sets. ( This database is free for academic research, not in the commerce, if without permission. ) diff --git a/dataset/aishell/aishell.py b/dataset/aishell/aishell.py index ec43104d..b3288757 100644 --- a/dataset/aishell/aishell.py +++ b/dataset/aishell/aishell.py @@ -18,143 +18,7 @@ Manifest file is a json-format file with each line containing the meta data (i.e. audio filepath, transcript and audio duration) of each audio file in the data set. 
""" -import argparse -import codecs -import json -import os -from pathlib import Path - -import soundfile - -from utils.utility import download -from utils.utility import unpack - -DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') - -URL_ROOT = 'http://openslr.elda.org/resources/33' -# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33' -DATA_URL = URL_ROOT + '/data_aishell.tgz' -MD5_DATA = '2f494334227864a8a8fec932999db9d8' -RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz' -MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5' - -parser = argparse.ArgumentParser(description=__doc__) -parser.add_argument( - "--target_dir", - default=DATA_HOME + "/Aishell", - type=str, - help="Directory to save the dataset. (default: %(default)s)") -parser.add_argument( - "--manifest_prefix", - default="manifest", - type=str, - help="Filepath prefix for output manifests. (default: %(default)s)") -args = parser.parse_args() - - -def create_manifest(data_dir, manifest_path_prefix): - print("Creating manifest %s ..." % manifest_path_prefix) - json_lines = [] - transcript_path = os.path.join(data_dir, 'transcript', - 'aishell_transcript_v0.8.txt') - transcript_dict = {} - for line in codecs.open(transcript_path, 'r', 'utf-8'): - line = line.strip() - if line == '': - continue - audio_id, text = line.split(' ', 1) - # remove withespace, charactor text - text = ''.join(text.split()) - transcript_dict[audio_id] = text - - data_types = ['train', 'dev', 'test'] - for dtype in data_types: - del json_lines[:] - total_sec = 0.0 - total_text = 0.0 - total_num = 0 - - audio_dir = os.path.join(data_dir, 'wav', dtype) - for subfolder, _, filelist in sorted(os.walk(audio_dir)): - for fname in filelist: - audio_path = os.path.abspath(os.path.join(subfolder, fname)) - audio_id = os.path.basename(fname)[:-4] - # if no transcription for audio then skipped - if audio_id not in transcript_dict: - continue - - utt2spk = Path(audio_path).parent.name - audio_data, samplerate = soundfile.read(audio_path) - duration = float(len(audio_data) / samplerate) - text = transcript_dict[audio_id] - json_lines.append( - json.dumps( - { - 'utt': audio_id, - 'utt2spk': str(utt2spk), - 'feat': audio_path, - 'feat_shape': (duration, ), # second - 'text': text - }, - ensure_ascii=False)) - - total_sec += duration - total_text += len(text) - total_num += 1 - - manifest_path = manifest_path_prefix + '.' + dtype - with codecs.open(manifest_path, 'w', 'utf-8') as fout: - for line in json_lines: - fout.write(line + '\n') - - manifest_dir = os.path.dirname(manifest_path_prefix) - meta_path = os.path.join(manifest_dir, dtype) + '.meta' - with open(meta_path, 'w') as f: - print(f"{dtype}:", file=f) - print(f"{total_num} utts", file=f) - print(f"{total_sec / (60*60)} h", file=f) - print(f"{total_text} text", file=f) - print(f"{total_text / total_sec} text/sec", file=f) - print(f"{total_sec / total_num} sec/utt", file=f) - - -def prepare_dataset(url, md5sum, target_dir, manifest_path=None): - """Download, unpack and create manifest file.""" - data_dir = os.path.join(target_dir, 'data_aishell') - if not os.path.exists(data_dir): - filepath = download(url, md5sum, target_dir) - unpack(filepath, target_dir) - # unpack all audio tar files - audio_dir = os.path.join(data_dir, 'wav') - for subfolder, _, filelist in sorted(os.walk(audio_dir)): - for ftar in filelist: - unpack(os.path.join(subfolder, ftar), subfolder, True) - else: - print("Skip downloading and unpacking. Data already exists in %s." 
%
-              target_dir)
-
-    if manifest_path:
-        create_manifest(data_dir, manifest_path)
-
-
-def main():
-    if args.target_dir.startswith('~'):
-        args.target_dir = os.path.expanduser(args.target_dir)
-
-    prepare_dataset(
-        url=DATA_URL,
-        md5sum=MD5_DATA,
-        target_dir=args.target_dir,
-        manifest_path=args.manifest_prefix)
-
-    prepare_dataset(
-        url=RESOURCE_URL,
-        md5sum=MD5_RESOURCE,
-        target_dir=args.target_dir,
-        manifest_path=None)
-
-    print("Data download and manifest prepare done!")
-
+from paddlespeech.dataset.aishell import aishell_main
 
 if __name__ == '__main__':
-    main()
+    aishell_main()
diff --git a/dataset/librispeech/librispeech.py b/dataset/librispeech/librispeech.py
index 2d6f1763..44567b0c 100644
--- a/dataset/librispeech/librispeech.py
+++ b/dataset/librispeech/librispeech.py
@@ -28,8 +28,8 @@ from multiprocessing.pool import Pool
 import distutils.util
 import soundfile
 
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
 
 URL_ROOT = "http://openslr.elda.org/resources/12"
 #URL_ROOT = "https://openslr.magicdatatech.com/resources/12"
diff --git a/dataset/mini_librispeech/mini_librispeech.py b/dataset/mini_librispeech/mini_librispeech.py
index 0eb80bf8..24bd98d8 100644
--- a/dataset/mini_librispeech/mini_librispeech.py
+++ b/dataset/mini_librispeech/mini_librispeech.py
@@ -27,8 +27,8 @@ from multiprocessing.pool import Pool
 
 import soundfile
 
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
 
 URL_ROOT = "http://openslr.elda.org/resources/31"
 URL_TRAIN_CLEAN = URL_ROOT + "/train-clean-5.tar.gz"
diff --git a/dataset/musan/musan.py b/dataset/musan/musan.py
index ae3430b2..85d986e8 100644
--- a/dataset/musan/musan.py
+++ b/dataset/musan/musan.py
@@ -29,8 +29,8 @@ import os
 
 import soundfile
 
-from utils.utility import download
-from utils.utility import unpack
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unpack
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
diff --git a/dataset/rir_noise/rir_noise.py b/dataset/rir_noise/rir_noise.py
index b1d47558..b98dff72 100644
--- a/dataset/rir_noise/rir_noise.py
+++ b/dataset/rir_noise/rir_noise.py
@@ -29,8 +29,8 @@ import os
 
 import soundfile
 
-from utils.utility import download
-from utils.utility import unzip
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unzip
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
diff --git a/dataset/tal_cs/README.md b/dataset/tal_cs/README.md
new file mode 100644
index 00000000..63305636
--- /dev/null
+++ b/dataset/tal_cs/README.md
@@ -0,0 +1,13 @@
+# [TAL_CSASR](https://ai.100tal.com/dataset/)
+
+This dataset consists of audio from TAL English classes and contains code-switched Chinese-English speech. Each recording has a single speaker, and the dataset covers more than 100 speakers in total (63.36 GB of files). It includes both intra-sentence and inter-sentence code-switching; the overall ratio of Chinese characters to English words is 13:1.
+ +- Total data: 587H (train_set: 555.9H, dev_set: 8H, test_set: 23.6H) +- Sample rate: 16000 +- Sample bit: 16 +- Recording device: microphone +- Speaker number: 200+ +- Recording time: 2019 +- Data format: audio: .wav; test: .txt +- Audio duration: 1-60s +- Data type: audio of English teachers' teaching diff --git a/dataset/tal_cs/tal_cs.py b/dataset/tal_cs/tal_cs.py new file mode 100644 index 00000000..2024b21e --- /dev/null +++ b/dataset/tal_cs/tal_cs.py @@ -0,0 +1,116 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare TALCS ASR datasets. + +create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +import argparse +import codecs +import io +import json +import os + +import soundfile + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +args = parser.parse_args() + +TRAIN_SET = os.path.join(args.target_dir, "train_set") +DEV_SET = os.path.join(args.target_dir, "dev_set") +TEST_SET = os.path.join(args.target_dir, "test_set") + +manifest_train_path = os.path.join(args.manifest_prefix, "manifest.train.raw") +manifest_dev_path = os.path.join(args.manifest_prefix, "manifest.dev.raw") +manifest_test_path = os.path.join(args.manifest_prefix, "manifest.test.raw") + + +def create_manifest(data_dir, manifest_path): + """Create a manifest json file summarizing the data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file within the data set. + """ + print("Creating manifest %s ..." 
% manifest_path) + json_lines = [] + total_sec = 0.0 + total_char = 0.0 + total_num = 0 + wav_dir = os.path.join(data_dir, 'wav') + text_filepath = os.path.join(data_dir, 'label.txt') + for subfolder, _, filelist in sorted(os.walk(wav_dir)): + for line in io.open(text_filepath, encoding="utf8"): + segments = line.strip().split() + nchars = len(segments[1:]) + text = ' '.join(segments[1:]).lower() + + audio_filepath = os.path.abspath( + os.path.join(subfolder, segments[0] + '.wav')) + audio_data, samplerate = soundfile.read(audio_filepath) + duration = float(len(audio_data)) / samplerate + + utt = os.path.splitext(os.path.basename(audio_filepath))[0] + utt2spk = '-'.join(utt.split('-')[:2]) + + json_lines.append( + json.dumps({ + 'utt': utt, + 'utt2spk': utt2spk, + 'feat': audio_filepath, + 'feat_shape': (duration, ), # second + 'text': text, + })) + + total_sec += duration + total_char += nchars + total_num += 1 + + with codecs.open(manifest_path, 'w', 'utf-8') as out_file: + for line in json_lines: + out_file.write(line + '\n') + + subset = os.path.splitext(manifest_path)[1][1:] + manifest_dir = os.path.dirname(manifest_path) + data_dir_name = os.path.split(data_dir)[-1] + meta_path = os.path.join(manifest_dir, data_dir_name) + '.meta' + with open(meta_path, 'w') as f: + print(f"{subset}:", file=f) + print(f"{total_num} utts", file=f) + print(f"{total_sec / (60*60)} h", file=f) + print(f"{total_char} char", file=f) + print(f"{total_char / total_sec} char/sec", file=f) + print(f"{total_sec / total_num} sec/utt", file=f) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + create_manifest(TRAIN_SET, manifest_train_path) + create_manifest(DEV_SET, manifest_dev_path) + create_manifest(TEST_SET, manifest_test_path) + print("Data download and manifest prepare done!") + + +if __name__ == '__main__': + main() diff --git a/dataset/thchs30/thchs30.py b/dataset/thchs30/thchs30.py index d41c0e17..c5c3eb7a 100644 --- a/dataset/thchs30/thchs30.py +++ b/dataset/thchs30/thchs30.py @@ -27,8 +27,8 @@ from pathlib import Path import soundfile -from utils.utility import download -from utils.utility import unpack +from paddlespeech.dataset.download import download +from paddlespeech.dataset.download import unpack DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') diff --git a/dataset/timit/timit.py b/dataset/timit/timit.py index c4a9f066..f3889d17 100644 --- a/dataset/timit/timit.py +++ b/dataset/timit/timit.py @@ -28,7 +28,7 @@ from pathlib import Path import soundfile -from utils.utility import unzip +from paddlespeech.dataset.download import unzip URL_ROOT = "" MD5_DATA = "45c68037c7fdfe063a43c851f181fb2d" diff --git a/dataset/voxceleb/voxceleb1.py b/dataset/voxceleb/voxceleb1.py index 95827f70..8d410067 100644 --- a/dataset/voxceleb/voxceleb1.py +++ b/dataset/voxceleb/voxceleb1.py @@ -31,9 +31,9 @@ from pathlib import Path import soundfile -from utils.utility import check_md5sum -from utils.utility import download -from utils.utility import unzip +from paddlespeech.dataset.download import check_md5sum +from paddlespeech.dataset.download import download +from paddlespeech.dataset.download import unzip # all the data will be download in the current data/voxceleb directory default DATA_HOME = os.path.expanduser('.') diff --git a/dataset/voxceleb/voxceleb2.py b/dataset/voxceleb/voxceleb2.py index fe9e8b9c..6df6d1f3 100644 --- a/dataset/voxceleb/voxceleb2.py +++ b/dataset/voxceleb/voxceleb2.py @@ -27,9 +27,9 @@ from pathlib import Path 
 import soundfile
 
-from utils.utility import check_md5sum
-from utils.utility import download
-from utils.utility import unzip
+from paddlespeech.dataset.download import check_md5sum
+from paddlespeech.dataset.download import download
+from paddlespeech.dataset.download import unzip
 
 # all the data will be download in the current data/voxceleb directory default
 DATA_HOME = os.path.expanduser('.')
diff --git a/dataset/voxforge/voxforge.py b/dataset/voxforge/voxforge.py
index 373791bf..327d200b 100644
--- a/dataset/voxforge/voxforge.py
+++ b/dataset/voxforge/voxforge.py
@@ -28,9 +28,9 @@ import subprocess
 
 import soundfile
 
-from utils.utility import download_multi
-from utils.utility import getfile_insensitive
-from utils.utility import unpack
+from paddlespeech.dataset.download import download_multi
+from paddlespeech.dataset.download import getfile_insensitive
+from paddlespeech.dataset.download import unpack
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
diff --git a/demos/TTSAndroid/README.md b/demos/TTSAndroid/README.md
index d6013562..36848cbe 100644
--- a/demos/TTSAndroid/README.md
+++ b/demos/TTSAndroid/README.md
@@ -1,6 +1,6 @@
 # 语音合成 Java API Demo 使用指南
 
-在 Android 上实现语音合成功能,此 Demo 有很好的的易用性和开放性,如在 Demo 中跑自己训练好的模型等。
+在 Android 上实现语音合成功能,此 Demo 有很好的易用性和开放性,如在 Demo 中跑自己训练好的模型等。
 
 本文主要介绍语音合成 Demo 运行方法。
 
@@ -157,8 +157,11 @@ Android 示例基于 Java API 开发,调用 Paddle Lite `Java API` 包括以
 
 ### 更新输入
 
-**本 Demo 不包含文本前端模块**,通过下拉框选择预先设置好的文本,在代码中映射成对应的 phone_id,**如需文本前端模块请自行处理**,`phone_id_map.txt`
-请参考 [fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip)。
+**本 Demo 不包含文本前端模块**,通过下拉框选择预先设置好的文本,在代码中映射成对应的 phone_id,**如需文本前端模块请自行处理**,可参考:
+- C++ 中文前端 [lym0302/paddlespeech_tts_cpp](https://github.com/lym0302/paddlespeech_tts_cpp)
+- C++ 英文 g2p [yazone/g2pE_mobile](https://github.com/yazone/g2pE_mobile)
+
+`phone_id_map.txt` 请参考 [fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_pdlite_1.3.0.zip)。
 
 ## 通过 setting 界面更新语音合成的相关参数
diff --git a/demos/TTSArmLinux/.gitignore b/demos/TTSArmLinux/.gitignore
new file mode 100644
index 00000000..f18480d7
--- /dev/null
+++ b/demos/TTSArmLinux/.gitignore
@@ -0,0 +1,8 @@
+# directories
+build/
+output/
+libs/
+models/
+
+# symlink
+dict
diff --git a/demos/TTSArmLinux/README.md b/demos/TTSArmLinux/README.md
new file mode 100644
index 00000000..a4ccba6c
--- /dev/null
+++ b/demos/TTSArmLinux/README.md
@@ -0,0 +1,91 @@
+# TTS ARM Linux C++ Demo
+
+Adapted from [demos/TTSAndroid](../TTSAndroid); the models also come from that Android demo.
+
+### Configure build options
+
+Open [config.sh](config.sh) and adjust the configuration as needed.
+
+A 64-bit build is produced by default; for a 32-bit build, change `ARM_ABI=armv8` to `ARM_ABI=armv7hf`.
+
+### Install dependencies
+
+```bash
+# Ubuntu
+sudo apt install build-essential cmake pkg-config wget tar unzip
+
+# CentOS
+sudo yum groupinstall "Development Tools"
+sudo yum install cmake wget tar unzip
+```
+
+### Download the Paddle Lite library and model files
+
+The precompiled binaries use the same Paddle Lite inference library ([Paddle-Lite:68b66fd35](https://github.com/PaddlePaddle/Paddle-Lite/tree/68b66fd356c875c92167d311ad458e6093078449)) and model ([fs2cnn_mbmelgan_cpu_v1.3.0](https://paddlespeech.bj.bcebos.com/demos/TTSAndroid/fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz)) as the Android demo.
+
+Download them with:
+
+```bash
+./download.sh
+```
+
+### Build the demo
+
+```bash
+./build.sh
+```
+
+The precompiled binaries are compatible with Ubuntu 16.04 through 20.04.
+
+If compiling or linking fails, your distribution is not compatible with the precompiled libraries; try building the Paddle Lite library manually, as described at the bottom of this document.
+
+### Run
+
+You can change the `--phone2id_path` parameter in `./front.conf` to the `phone_id_map.txt` of your own acoustic model.
+
+```bash
+./run.sh
+./run.sh --sentence "语音合成测试"
+./run.sh --sentence "输出到指定的音频文件" --output_wav ./output/test.wav
+./run.sh --help
+```
+
+Only Chinese synthesis is supported at the moment; any English text in the input will crash the program.
+
+If `--output_wav` is not specified, the audio is written to `./output/tts.wav` by default.
+
+## Building the Paddle Lite library manually
+
+The precompiled binaries are compatible with Ubuntu 16.04 through 20.04; if your distribution is not, you can build Paddle Lite from source yourself.
+
+Note that we can only guarantee that [Paddle-Lite:68b66fd35](https://github.com/PaddlePaddle/Paddle-Lite/tree/68b66fd356c875c92167d311ad458e6093078449) is compatible with the models downloaded by `download.sh`.
+If you use another version of the Paddle Lite library, you may need to re-export the models with the matching version of the opt tool.
+
+In addition, [Paddle-Lite 2.12](https://github.com/PaddlePaddle/Paddle-Lite/releases/tag/v2.12) is incompatible with TTS: it can neither export nor run TTS models, so a newer version is required (for example, the code on the `develop` branch).
+However, the code on the `develop` branch may be incompatible with the models downloaded by `download.sh`, and the demo may crash at runtime.
+
+### Install the build dependencies of Paddle Lite
+
+```bash
+# Ubuntu
+sudo apt install build-essential cmake git python
+
+# CentOS
+sudo yum groupinstall "Development Tools"
+sudo yum install cmake git python
+```
+
+### Build Paddle Lite 68b66fd35
+
+```
+git clone https://github.com/PaddlePaddle/Paddle-Lite.git
+cd Paddle-Lite
+git checkout 68b66fd356c875c92167d311ad458e6093078449
+./lite/tools/build_linux.sh --with_extra=ON
+```
+
+After the build finishes, open the demo's [config.sh](config.sh) and change `PADDLE_LITE_DIR` to the following value (replace `/path/to/` with the actual directory):
+
+```
+PADDLE_LITE_DIR="/path/to/Paddle-Lite/build.lite.linux.${ARM_ABI}.gcc/inference_lite_lib.armlinux.${ARM_ABI}/cxx"
+```
diff --git a/demos/TTSArmLinux/build-depends.sh b/demos/TTSArmLinux/build-depends.sh
new file mode 120000
index 00000000..fd3aec9c
--- /dev/null
+++ b/demos/TTSArmLinux/build-depends.sh
@@ -0,0 +1 @@
+src/TTSCppFrontend/build-depends.sh
\ No newline at end of file
diff --git a/demos/TTSArmLinux/build.sh b/demos/TTSArmLinux/build.sh
new file mode 100755
index 00000000..5d31173e
--- /dev/null
+++ b/demos/TTSArmLinux/build.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+set -e
+set -x
+
+cd "$(dirname "$(realpath "$0")")"
+
+BASE_DIR="$PWD"
+
+# load configure
+. ./config.sh
+
+# build
+echo "ARM_ABI is ${ARM_ABI}"
+echo "PADDLE_LITE_DIR is ${PADDLE_LITE_DIR}"
+
+echo "Build depends..."
+./build-depends.sh "$@"
+
+mkdir -p "$BASE_DIR/build"
+cd "$BASE_DIR/build"
+cmake -DPADDLE_LITE_DIR="${PADDLE_LITE_DIR}" -DARM_ABI="${ARM_ABI}" ../src
+
+if [ "$*" = "" ]; then
+    make -j$(nproc)
+else
+    make "$@"
+fi
+
+echo "make successful!"
diff --git a/demos/TTSArmLinux/clean.sh b/demos/TTSArmLinux/clean.sh
new file mode 100755
index 00000000..2743801c
--- /dev/null
+++ b/demos/TTSArmLinux/clean.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -e
+set -x
+
+cd "$(dirname "$(realpath "$0")")"
+
+BASE_DIR="$PWD"
+
+# load configure
+. 
./config.sh + +# remove dirs +set -x + +rm -rf "$OUTPUT_DIR" +rm -rf "$LIBS_DIR" +rm -rf "$MODELS_DIR" +rm -rf "$BASE_DIR/build" + +"$BASE_DIR/src/TTSCppFrontend/clean.sh" + +# 符号连接 +rm "$BASE_DIR/dict" diff --git a/demos/TTSArmLinux/config.sh b/demos/TTSArmLinux/config.sh new file mode 100644 index 00000000..bf38d7d6 --- /dev/null +++ b/demos/TTSArmLinux/config.sh @@ -0,0 +1,15 @@ +# configuration + +ARM_ABI=armv8 +#ARM_ABI=armv7hf + +MODELS_DIR="${PWD}/models" +LIBS_DIR="${PWD}/libs" +OUTPUT_DIR="${PWD}/output" + +PADDLE_LITE_DIR="${LIBS_DIR}/inference_lite_lib.armlinux.${ARM_ABI}.gcc.with_extra.with_cv/cxx" +#PADDLE_LITE_DIR="/path/to/Paddle-Lite/build.lite.linux.${ARM_ABI}.gcc/inference_lite_lib.armlinux.${ARM_ABI}/cxx" + +ACOUSTIC_MODEL_PATH="${MODELS_DIR}/cpu/fastspeech2_csmsc_arm.nb" +VOCODER_PATH="${MODELS_DIR}/cpu/mb_melgan_csmsc_arm.nb" +FRONT_CONF="${PWD}/front.conf" diff --git a/demos/TTSArmLinux/download.sh b/demos/TTSArmLinux/download.sh new file mode 100755 index 00000000..7eaa836a --- /dev/null +++ b/demos/TTSArmLinux/download.sh @@ -0,0 +1,70 @@ +#!/bin/bash +set -e + +cd "$(dirname "$(realpath "$0")")" + +BASE_DIR="$PWD" + +# load configure +. ./config.sh + +mkdir -p "$LIBS_DIR" "$MODELS_DIR" + +download() { + file="$1" + url="$2" + md5="$3" + dir="$4" + + cd "$dir" + + if [ -f "$file" ] && [ "$(md5sum "$file" | awk '{ print $1 }')" = "$md5" ]; then + echo "File $file (MD5: $md5) has been downloaded." + else + echo "Downloading $file..." + wget -O "$file" "$url" + + # MD5 verify + fileMd5="$(md5sum "$file" | awk '{ print $1 }')" + if [ "$fileMd5" == "$md5" ]; then + echo "File $file (MD5: $md5) has been downloaded." + else + echo "MD5 mismatch, file may be corrupt" + echo "$file MD5: $fileMd5, it should be $md5" + fi + fi + + echo "Extracting $file..." + echo '-----------------------' + tar -vxf "$file" + echo '=======================' +} + +######################################## + +echo "Download models..." + +download 'inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \ + 'https://paddlespeech.bj.bcebos.com/demos/TTSArmLinux/inference_lite_lib.armlinux.armv8.gcc.with_extra.with_cv.tar.gz' \ + '39e0c6604f97c70f5d13c573d7e709b9' \ + "$LIBS_DIR" + +download 'inference_lite_lib.armlinux.armv7hf.gcc.with_extra.with_cv.tar.gz' \ + 'https://paddlespeech.bj.bcebos.com/demos/TTSArmLinux/inference_lite_lib.armlinux.armv7hf.gcc.with_extra.with_cv.tar.gz' \ + 'f5ceb509f0b610dafb8379889c5f36f8' \ + "$LIBS_DIR" + +download 'fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz' \ + 'https://paddlespeech.bj.bcebos.com/demos/TTSAndroid/fs2cnn_mbmelgan_cpu_v1.3.0.tar.gz' \ + '93ef17d44b498aff3bea93e2c5c09a1e' \ + "$MODELS_DIR" + +echo "Done." + +######################################## + +echo "Download dictionary files..." 
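+# The front-end dictionaries live in TTSCppFrontend; link its front_demo/dict
+# folder into this demo's root, then fetch the dictionary files themselves.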
+ +ln -s src/TTSCppFrontend/front_demo/dict "$BASE_DIR/" + +"$BASE_DIR/src/TTSCppFrontend/download.sh" diff --git a/demos/TTSArmLinux/front.conf b/demos/TTSArmLinux/front.conf new file mode 100644 index 00000000..5960b32a --- /dev/null +++ b/demos/TTSArmLinux/front.conf @@ -0,0 +1,21 @@ +# jieba conf +--jieba_dict_path=./dict/jieba/jieba.dict.utf8 +--jieba_hmm_path=./dict/jieba/hmm_model.utf8 +--jieba_user_dict_path=./dict/jieba/user.dict.utf8 +--jieba_idf_path=./dict/jieba/idf.utf8 +--jieba_stop_word_path=./dict/jieba/stop_words.utf8 + +# dict conf fastspeech2_0.4 +--separate_tone=false +--word2phone_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict +--phone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt +--tone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict + +# dict conf speedyspeech_0.5 +#--separate_tone=true +#--word2phone_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict +#--phone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt +#--tone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt + +# dict of tranditional_to_simplified +--trand2simpd_path=./dict/tranditional_to_simplified/trand2simp.txt diff --git a/demos/TTSArmLinux/run.sh b/demos/TTSArmLinux/run.sh new file mode 100755 index 00000000..d0860f04 --- /dev/null +++ b/demos/TTSArmLinux/run.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -e + +cd "$(dirname "$(realpath "$0")")" + +# load configure +. ./config.sh + +# create dir +mkdir -p "$OUTPUT_DIR" + +# run +set -x +./build/paddlespeech_tts_demo \ + --front_conf "$FRONT_CONF" \ + --acoustic_model "$ACOUSTIC_MODEL_PATH" \ + --vocoder "$VOCODER_PATH" \ + "$@" +# end diff --git a/demos/TTSArmLinux/src/CMakeLists.txt b/demos/TTSArmLinux/src/CMakeLists.txt new file mode 100644 index 00000000..f8240d0c --- /dev/null +++ b/demos/TTSArmLinux/src/CMakeLists.txt @@ -0,0 +1,80 @@ +cmake_minimum_required(VERSION 3.10) +project(paddlespeech_tts_demo) + + +########## Global Options ########## + +option(WITH_FRONT_DEMO "Build front demo" OFF) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(ABSL_PROPAGATE_CXX_STD ON) + + +########## ARM Options ########## + +set(CMAKE_SYSTEM_NAME Linux) +if(ARM_ABI STREQUAL "armv8") + set(CMAKE_SYSTEM_PROCESSOR aarch64) + #set(CMAKE_C_COMPILER "aarch64-linux-gnu-gcc") + #set(CMAKE_CXX_COMPILER "aarch64-linux-gnu-g++") +elseif(ARM_ABI STREQUAL "armv7hf") + set(CMAKE_SYSTEM_PROCESSOR arm) + #set(CMAKE_C_COMPILER "arm-linux-gnueabihf-gcc") + #set(CMAKE_CXX_COMPILER "arm-linux-gnueabihf-g++") +else() + message(FATAL_ERROR "Unknown arch abi ${ARM_ABI}, only support armv8 and armv7hf.") + return() +endif() + + +########## Paddle Lite Options ########## + +message(STATUS "TARGET ARCH ABI: ${ARM_ABI}") +message(STATUS "PADDLE LITE DIR: ${PADDLE_LITE_DIR}") + +include_directories(${PADDLE_LITE_DIR}/include) +link_directories(${PADDLE_LITE_DIR}/libs/${ARM_ABI}) +link_directories(${PADDLE_LITE_DIR}/lib) + +if(ARM_ABI STREQUAL "armv8") + set(CMAKE_CXX_FLAGS "-march=armv8-a ${CMAKE_CXX_FLAGS}") + set(CMAKE_C_FLAGS "-march=armv8-a ${CMAKE_C_FLAGS}") +elseif(ARM_ABI STREQUAL "armv7hf") + set(CMAKE_CXX_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_CXX_FLAGS}") + set(CMAKE_C_FLAGS "-march=armv7-a -mfloat-abi=hard -mfpu=neon-vfpv4 ${CMAKE_C_FLAGS}" ) +endif() + + +########## Dependencies ########## + +find_package(OpenMP REQUIRED) +if(OpenMP_FOUND OR OpenMP_CXX_FOUND) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + message(STATUS "Found OpenMP ${OpenMP_VERSION} ${OpenMP_CXX_VERSION}") + message(STATUS "OpenMP C flags: ${OpenMP_C_FLAGS}") + message(STATUS "OpenMP CXX flags: ${OpenMP_CXX_FLAGS}") + message(STATUS "OpenMP OpenMP_CXX_LIB_NAMES: ${OpenMP_CXX_LIB_NAMES}") + message(STATUS "OpenMP OpenMP_CXX_LIBRARIES: ${OpenMP_CXX_LIBRARIES}") +else() + message(FATAL_ERROR "Could not found OpenMP!") + return() +endif() + + +############### tts cpp frontend ############### + +add_subdirectory(TTSCppFrontend) + +include_directories( + TTSCppFrontend/src + third-party/build/src/cppjieba/include + third-party/build/src/limonp/include +) + + +############### paddlespeech_tts_demo ############### + +add_executable(paddlespeech_tts_demo main.cc) +target_link_libraries(paddlespeech_tts_demo paddle_light_api_shared paddlespeech_tts_front) diff --git a/demos/TTSArmLinux/src/Predictor.hpp b/demos/TTSArmLinux/src/Predictor.hpp new file mode 100644 index 00000000..f173abb5 --- /dev/null +++ b/demos/TTSArmLinux/src/Predictor.hpp @@ -0,0 +1,320 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include +#include +#include +#include +#include +#include +#include +#include "paddle_api.h" + +using namespace paddle::lite_api; + +class PredictorInterface { + public: + virtual ~PredictorInterface() = 0; + virtual bool Init(const std::string &AcousticModelPath, + const std::string &VocoderPath, + PowerMode cpuPowerMode, + int cpuThreadNum, + // WAV采样率(必须与模型输出匹配) + // 如果播放速度和音调异常,请修改采样率 + // 常见采样率:16000, 24000, 32000, 44100, 48000, 96000 + uint32_t wavSampleRate) = 0; + virtual std::shared_ptr LoadModel( + const std::string &modelPath, + int cpuThreadNum, + PowerMode cpuPowerMode) = 0; + virtual void ReleaseModel() = 0; + virtual bool RunModel(const std::vector &phones) = 0; + virtual std::unique_ptr GetAcousticModelOutput( + const std::vector &phones) = 0; + virtual std::unique_ptr GetVocoderOutput( + std::unique_ptr &&amOutput) = 0; + virtual void VocoderOutputToWav( + std::unique_ptr &&vocOutput) = 0; + virtual void SaveFloatWav(float *floatWav, int64_t size) = 0; + virtual bool IsLoaded() = 0; + virtual float GetInferenceTime() = 0; + virtual int GetWavSize() = 0; + // 获取WAV持续时间(单位:毫秒) + virtual float GetWavDuration() = 0; + // 获取RTF(合成时间 / 音频时长) + virtual float GetRTF() = 0; + virtual void ReleaseWav() = 0; + virtual bool WriteWavToFile(const std::string &wavPath) = 0; +}; + +PredictorInterface::~PredictorInterface() {} + +// WavDataType: WAV数据类型 +// 可在 int16_t 和 float 之间切换, +// 用于生成 16-bit PCM 或 32-bit IEEE float 格式的 WAV +template +class Predictor : public PredictorInterface { + public: + bool Init(const std::string &AcousticModelPath, + const std::string &VocoderPath, + PowerMode cpuPowerMode, + int cpuThreadNum, + // WAV采样率(必须与模型输出匹配) + // 如果播放速度和音调异常,请修改采样率 + // 常见采样率:16000, 24000, 32000, 44100, 48000, 96000 + uint32_t wavSampleRate) override { + // Release model if exists + ReleaseModel(); + + 
+        acoustic_model_predictor_ =
+            LoadModel(AcousticModelPath, cpuThreadNum, cpuPowerMode);
+        if (acoustic_model_predictor_ == nullptr) {
+            return false;
+        }
+        vocoder_predictor_ = LoadModel(VocoderPath, cpuThreadNum, cpuPowerMode);
+        if (vocoder_predictor_ == nullptr) {
+            return false;
+        }
+
+        wav_sample_rate_ = wavSampleRate;
+
+        return true;
+    }
+
+    virtual ~Predictor() {
+        ReleaseModel();
+        ReleaseWav();
+    }
+
+    std::shared_ptr<PaddlePredictor> LoadModel(
+        const std::string &modelPath,
+        int cpuThreadNum,
+        PowerMode cpuPowerMode) override {
+        if (modelPath.empty()) {
+            return nullptr;
+        }
+
+        // set up the MobileConfig
+        MobileConfig config;
+        config.set_model_from_file(modelPath);
+        config.set_threads(cpuThreadNum);
+        config.set_power_mode(cpuPowerMode);
+
+        return CreatePaddlePredictor<MobileConfig>(config);
+    }
+
+    void ReleaseModel() override {
+        acoustic_model_predictor_ = nullptr;
+        vocoder_predictor_ = nullptr;
+    }
+
+    bool RunModel(const std::vector<float> &phones) override {
+        if (!IsLoaded()) {
+            return false;
+        }
+
+        // start timing
+        auto start = std::chrono::system_clock::now();
+
+        // run inference
+        VocoderOutputToWav(GetVocoderOutput(GetAcousticModelOutput(phones)));
+
+        // stop timing
+        auto end = std::chrono::system_clock::now();
+
+        // compute the elapsed time
+        std::chrono::duration<float> duration = end - start;
+        inference_time_ = duration.count() * 1000;  // in milliseconds
+
+        return true;
+    }
+
+    std::unique_ptr<const Tensor> GetAcousticModelOutput(
+        const std::vector<float> &phones) override {
+        auto phones_handle = acoustic_model_predictor_->GetInput(0);
+        phones_handle->Resize({static_cast<int64_t>(phones.size())});
+        phones_handle->CopyFromCpu(phones.data());
+        acoustic_model_predictor_->Run();
+
+        // fetch the output tensor
+        auto am_output_handle = acoustic_model_predictor_->GetOutput(0);
+        // print the shape of the output tensor
+        std::cout << "Acoustic Model Output shape: ";
+        auto shape = am_output_handle->shape();
+        for (auto s : shape) {
+            std::cout << s << ", ";
+        }
+        std::cout << std::endl;
+
+        return am_output_handle;
+    }
+
+    std::unique_ptr<const Tensor> GetVocoderOutput(
+        std::unique_ptr<const Tensor> &&amOutput) override {
+        auto mel_handle = vocoder_predictor_->GetInput(0);
+        // [?, 80]
+        auto dims = amOutput->shape();
+        mel_handle->Resize(dims);
+        auto am_output_data = amOutput->mutable_data<float>();
+        mel_handle->CopyFromCpu(am_output_data);
+        vocoder_predictor_->Run();
+
+        // fetch the output tensor
+        auto voc_output_handle = vocoder_predictor_->GetOutput(0);
+        // print the shape of the output tensor
+        std::cout << "Vocoder Output shape: ";
+        auto shape = voc_output_handle->shape();
+        for (auto s : shape) {
+            std::cout << s << ", ";
+        }
+        std::cout << std::endl;
+
+        return voc_output_handle;
+    }
+
+    void VocoderOutputToWav(
+        std::unique_ptr<const Tensor> &&vocOutput) override {
+        // read the data of the output tensor
+        int64_t output_size = 1;
+        for (auto dim : vocOutput->shape()) {
+            output_size *= dim;
+        }
+        auto output_data = vocOutput->mutable_data<float>();
+
+        SaveFloatWav(output_data, output_size);
+    }
+
+    void SaveFloatWav(float *floatWav, int64_t size) override;
+
+    bool IsLoaded() override {
+        return acoustic_model_predictor_ != nullptr &&
+               vocoder_predictor_ != nullptr;
+    }
+
+    float GetInferenceTime() override { return inference_time_; }
+
+    const std::vector<WavDataType> &GetWav() { return wav_; }
+
+    int GetWavSize() override { return wav_.size() * sizeof(WavDataType); }
+
+    // WAV duration, in milliseconds
+    float GetWavDuration() override {
+        return static_cast<float>(GetWavSize()) / sizeof(WavDataType) /
+               static_cast<float>(wav_sample_rate_) * 1000;
+    }
+
+    // RTF (synthesis time / audio duration)
+    float GetRTF() override { return GetInferenceTime() / GetWavDuration(); }
+
+    void ReleaseWav() override { wav_.clear(); }
+
+    bool WriteWavToFile(const std::string &wavPath) override {
+        std::ofstream fout(wavPath, std::ios::binary);
+        if (!fout.is_open()) {
+            return false;
+        }
+
+        // write the WAV header
+        WavHeader header;
+        header.audio_format = GetWavAudioFormat();
+        header.data_size = GetWavSize();
+        header.size = sizeof(header) - 8 + header.data_size;
+        header.sample_rate = wav_sample_rate_;
+        header.byte_rate = header.sample_rate * header.num_channels *
+                           header.bits_per_sample / 8;
+        header.block_align = header.num_channels * header.bits_per_sample / 8;
+        fout.write(reinterpret_cast<const char *>(&header), sizeof(header));
+
+        // write the WAV samples
+        fout.write(reinterpret_cast<const char *>(wav_.data()),
+                   header.data_size);
+
+        fout.close();
+        return true;
+    }
+
+  protected:
+    struct WavHeader {
+        // RIFF chunk
+        char riff[4] = {'R', 'I', 'F', 'F'};
+        uint32_t size = 0;
+        char wave[4] = {'W', 'A', 'V', 'E'};
+
+        // FMT chunk
+        char fmt[4] = {'f', 'm', 't', ' '};
+        uint32_t fmt_size = 16;
+        uint16_t audio_format = 0;
+        uint16_t num_channels = 1;
+        uint32_t sample_rate = 0;
+        uint32_t byte_rate = 0;
+        uint16_t block_align = 0;
+        uint16_t bits_per_sample = sizeof(WavDataType) * 8;
+
+        // DATA chunk
+        char data[4] = {'d', 'a', 't', 'a'};
+        uint32_t data_size = 0;
+    };
+
+    enum WavAudioFormat {
+        WAV_FORMAT_16BIT_PCM = 1,   // 16-bit PCM
+        WAV_FORMAT_32BIT_FLOAT = 3  // 32-bit IEEE float
+    };
+
+  protected:
+    // the return value is selected by template specialization on WavDataType
+    inline uint16_t GetWavAudioFormat();
+
+    inline float Abs(float number) { return (number < 0) ? -number : number; }
+
+  protected:
+    float inference_time_ = 0;
+    uint32_t wav_sample_rate_ = 0;
+    std::vector<WavDataType> wav_;
+    std::shared_ptr<PaddlePredictor> acoustic_model_predictor_ = nullptr;
+    std::shared_ptr<PaddlePredictor> vocoder_predictor_ = nullptr;
+};
+
+template <>
+uint16_t Predictor<int16_t>::GetWavAudioFormat() {
+    return Predictor::WAV_FORMAT_16BIT_PCM;
+}
+
+template <>
+uint16_t Predictor<float>::GetWavAudioFormat() {
+    return Predictor::WAV_FORMAT_32BIT_FLOAT;
+}
+
+// save a 16-bit PCM WAV
+template <>
+void Predictor<int16_t>::SaveFloatWav(float *floatWav, int64_t size) {
+    wav_.resize(size);
+    float maxSample = 0.01;
+    // find the largest sample magnitude
+    for (int64_t i = 0; i < size; i++) {
+        float sample = Abs(floatWav[i]);
+        if (sample > maxSample) {
+            maxSample = sample;
+        }
+    }
+    // scale the samples into the int16_t range
+    for (int64_t i = 0; i < size; i++) {
+        wav_[i] = floatWav[i] * 32767.0f / maxSample;
+    }
+}
+
+// save a 32-bit IEEE float WAV
+template <>
+void Predictor<float>::SaveFloatWav(float *floatWav, int64_t size) {
+    wav_.resize(size);
+    std::copy_n(floatWav, size, wav_.data());
+}
diff --git a/demos/TTSArmLinux/src/TTSCppFrontend b/demos/TTSArmLinux/src/TTSCppFrontend
new file mode 120000
index 00000000..25953976
--- /dev/null
+++ b/demos/TTSArmLinux/src/TTSCppFrontend
@@ -0,0 +1 @@
+../../TTSCppFrontend/
\ No newline at end of file
diff --git a/demos/TTSArmLinux/src/main.cc b/demos/TTSArmLinux/src/main.cc
new file mode 100644
index 00000000..0b8e26bc
--- /dev/null
+++ b/demos/TTSArmLinux/src/main.cc
@@ -0,0 +1,162 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
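+
+// Demo pipeline: text --(TTSCppFrontend)--> phone IDs --(acoustic model)-->
+// mel spectrogram --(vocoder)--> audio samples --> WAV file on disk.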
+
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include "front/front_interface.h"
+#include "limonp/StringUtil.hpp"
+#include "Predictor.hpp"
+
+using namespace paddle::lite_api;
+
+DEFINE_string(
+    sentence,
+    "你好,欢迎使用语音合成服务",
+    "Text to be synthesized (Chinese only. English will crash the program.)");
+DEFINE_string(front_conf, "./front.conf", "Front configuration file");
+DEFINE_string(acoustic_model,
+              "./models/cpu/fastspeech2_csmsc_arm.nb",
+              "Acoustic model .nb file");
+DEFINE_string(vocoder,
+              "./models/cpu/mb_melgan_csmsc_arm.nb",
+              "Vocoder .nb file");
+DEFINE_string(output_wav, "./output/tts.wav", "Output WAV file");
+DEFINE_string(wav_bit_depth,
+              "16",
+              "WAV bit depth, 16 (16-bit PCM) or 32 (32-bit IEEE float)");
+DEFINE_string(wav_sample_rate,
+              "24000",
+              "WAV sample rate, should match the output of the vocoder");
+DEFINE_string(cpu_thread, "1", "Number of CPU threads");
+
+int main(int argc, char *argv[]) {
+    gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+    PredictorInterface *predictor;
+
+    if (FLAGS_wav_bit_depth == "16") {
+        predictor = new Predictor<int16_t>();
+    } else if (FLAGS_wav_bit_depth == "32") {
+        predictor = new Predictor<float>();
+    } else {
+        LOG(ERROR) << "Unsupported WAV bit depth: " << FLAGS_wav_bit_depth;
+        return -1;
+    }
+
+
+    /////////////////////////// Front end: text to phonemes ///////////////////////////
+
+    // instantiate the text front-end engine
+    ppspeech::FrontEngineInterface *front_inst = nullptr;
+    front_inst = new ppspeech::FrontEngineInterface(FLAGS_front_conf);
+    if ((!front_inst) || (front_inst->init())) {
+        LOG(ERROR) << "Create tts engine failed!";
+        if (front_inst != nullptr) {
+            delete front_inst;
+        }
+        front_inst = nullptr;
+        return -1;
+    }
+
+    std::wstring ws_sentence = ppspeech::utf8string2wstring(FLAGS_sentence);
+
+    // convert traditional Chinese to simplified Chinese
+    std::wstring sentence_simp;
+    front_inst->Trand2Simp(ws_sentence, &sentence_simp);
+    ws_sentence = sentence_simp;
+
+    std::string s_sentence;
+    std::vector<std::wstring> sentence_part;
+    std::vector<int> phoneids = {};
+    std::vector<int> toneids = {};
+
+    // split the text into sentences at punctuation
+    LOG(INFO) << "Start to segment sentences by punctuation";
+    front_inst->SplitByPunc(ws_sentence, &sentence_part);
+    LOG(INFO) << "Segment sentences through punctuation successfully";
+
+    // after splitting, look up the phoneme IDs of each sentence
+    LOG(INFO)
+        << "Start to get the phoneme and tone id sequence of each sentence";
+    for (int i = 0; i < sentence_part.size(); i++) {
+        LOG(INFO) << "Raw sentence is: "
+                  << ppspeech::wstring2utf8string(sentence_part[i]);
+        front_inst->SentenceNormalize(&sentence_part[i]);
+        s_sentence = ppspeech::wstring2utf8string(sentence_part[i]);
+        LOG(INFO) << "After normalization sentence is: " << s_sentence;
+
+        if (0 != front_inst->GetSentenceIds(s_sentence, &phoneids, &toneids)) {
+            LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed";
+            return -1;
+        }
+    }
+    LOG(INFO) << "The phoneids of the sentence is: "
+              << limonp::Join(phoneids.begin(), phoneids.end(), " ");
+    LOG(INFO) << "The toneids of the sentence is: "
+              << limonp::Join(toneids.begin(), toneids.end(), " ");
+    LOG(INFO) << "Get the phoneme id sequence of each sentence successfully";
+
+
+    /////////////////////////// Back end: phonemes to audio ///////////////////////////
+
+    // WAV sample rate (must match the model output).
+    // If playback speed or pitch is wrong, adjust it.
+    // Common rates: 16000, 24000, 32000, 44100, 48000, 96000
+    const uint32_t wavSampleRate = std::stoul(FLAGS_wav_sample_rate);
+
+    // number of CPU threads
+    const int cpuThreadNum = std::stol(FLAGS_cpu_thread);
+
+    // CPU power mode
+    const PowerMode cpuPowerMode = PowerMode::LITE_POWER_HIGH;
+
+    if (!predictor->Init(FLAGS_acoustic_model,
+                         FLAGS_vocoder,
+                         cpuPowerMode,
+                         cpuThreadNum,
+                         wavSampleRate)) {
LOG(ERROR) << "predictor init failed" << std::endl; + return -1; + } + + std::vector phones(phoneids.size()); + std::transform(phoneids.begin(), phoneids.end(), phones.begin(), [](int x) { + return static_cast(x); + }); + + if (!predictor->RunModel(phones)) { + LOG(ERROR) << "predictor run model failed" << std::endl; + return -1; + } + + LOG(INFO) << "Inference time: " << predictor->GetInferenceTime() << " ms, " + << "WAV size (without header): " << predictor->GetWavSize() + << " bytes, " + << "WAV duration: " << predictor->GetWavDuration() << " ms, " + << "RTF: " << predictor->GetRTF() << std::endl; + + if (!predictor->WriteWavToFile(FLAGS_output_wav)) { + LOG(ERROR) << "write wav file failed" << std::endl; + return -1; + } + + delete predictor; + + return 0; +} diff --git a/demos/TTSArmLinux/src/third-party b/demos/TTSArmLinux/src/third-party new file mode 120000 index 00000000..851b2c1e --- /dev/null +++ b/demos/TTSArmLinux/src/third-party @@ -0,0 +1 @@ +TTSCppFrontend/third-party \ No newline at end of file diff --git a/demos/TTSCppFrontend/.gitignore b/demos/TTSCppFrontend/.gitignore new file mode 100644 index 00000000..0075a901 --- /dev/null +++ b/demos/TTSCppFrontend/.gitignore @@ -0,0 +1,2 @@ +build/ +dict/ diff --git a/demos/TTSCppFrontend/CMakeLists.txt b/demos/TTSCppFrontend/CMakeLists.txt new file mode 100644 index 00000000..14245372 --- /dev/null +++ b/demos/TTSCppFrontend/CMakeLists.txt @@ -0,0 +1,63 @@ +cmake_minimum_required(VERSION 3.10) +project(paddlespeech_tts_cpp) + + +########## Global Options ########## + +option(WITH_FRONT_DEMO "Build front demo" ON) + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_POSITION_INDEPENDENT_CODE ON) +set(ABSL_PROPAGATE_CXX_STD ON) + + +########## Dependencies ########## + +set(ENV{PKG_CONFIG_PATH} "${CMAKE_SOURCE_DIR}/third-party/build/lib/pkgconfig:${CMAKE_SOURCE_DIR}/third-party/build/lib64/pkgconfig") +find_package(PkgConfig REQUIRED) + +# It is hard to load xxx-config.cmake in a custom location, so use pkgconfig instead. +pkg_check_modules(ABSL REQUIRED absl_strings IMPORTED_TARGET) +pkg_check_modules(GFLAGS REQUIRED gflags IMPORTED_TARGET) +pkg_check_modules(GLOG REQUIRED libglog IMPORTED_TARGET) + +# load header-only libraries +include_directories( + ${CMAKE_SOURCE_DIR}/third-party/build/src/cppjieba/include + ${CMAKE_SOURCE_DIR}/third-party/build/src/limonp/include +) + +find_package(Threads REQUIRED) + + +########## paddlespeech_tts_front ########## + +include_directories(src) + +file(GLOB FRONT_SOURCES + ./src/base/*.cpp + ./src/front/*.cpp +) +add_library(paddlespeech_tts_front STATIC ${FRONT_SOURCES}) + +target_link_libraries( + paddlespeech_tts_front + PUBLIC + PkgConfig::GFLAGS + PkgConfig::GLOG + PkgConfig::ABSL + Threads::Threads +) + + +########## tts_front_demo ########## + +if (WITH_FRONT_DEMO) + + file(GLOB FRONT_DEMO_SOURCES front_demo/*.cpp) + add_executable(tts_front_demo ${FRONT_DEMO_SOURCES}) + + target_include_directories(tts_front_demo PRIVATE ./front_demo) + target_link_libraries(tts_front_demo PRIVATE paddlespeech_tts_front) + +endif (WITH_FRONT_DEMO) diff --git a/demos/TTSCppFrontend/README.md b/demos/TTSCppFrontend/README.md new file mode 100644 index 00000000..c179fdd0 --- /dev/null +++ b/demos/TTSCppFrontend/README.md @@ -0,0 +1,56 @@ +# PaddleSpeech TTS CPP Frontend + +A TTS frontend that implements text-to-phoneme conversion. + +Currently it only supports Chinese, any English word will crash the demo. 
+ +## Install Build Tools + +```bash +# Ubuntu +sudo apt install build-essential cmake pkg-config + +# CentOS +sudo yum groupinstall "Development Tools" +sudo yum install cmake +``` + +If your cmake version is too old, you can go here to download a precompiled new version: https://cmake.org/download/ + +## Build + +```bash +# Build with all CPU cores +./build.sh + +# Build with 1 core +./build.sh -j1 +``` + +Dependent libraries will be automatically downloaded to the `third-party/build` folder. + +If the download speed is too slow, you can open [third-party/CMakeLists.txt](third-party/CMakeLists.txt) and modify `GIT_REPOSITORY` URLs. + +## Download dictionary files + +```bash +./download.sh +``` + +## Run +You can change `--phone2id_path` in `./front_demo/front.conf` to the `phone_id_map.txt` of your own acoustic model. + +```bash +./run_front_demo.sh +./run_front_demo.sh --help +./run_front_demo.sh --sentence "这是语音合成服务的文本前端,用于将文本转换为音素序号数组。" +./run_front_demo.sh --front_conf ./front_demo/front.conf --sentence "你还需要一个语音合成后端才能将其转换为实际的声音。" +``` + +## Clean + +```bash +./clean.sh +``` + +The folders `front_demo/dict`, `build` and `third-party/build` will be deleted. diff --git a/demos/TTSCppFrontend/build-depends.sh b/demos/TTSCppFrontend/build-depends.sh new file mode 100755 index 00000000..c5f2ca12 --- /dev/null +++ b/demos/TTSCppFrontend/build-depends.sh @@ -0,0 +1,20 @@ +#!/bin/bash +set -e +set -x + +cd "$(dirname "$(realpath "$0")")" + +cd ./third-party + +mkdir -p build +cd build + +cmake .. + +if [ "$*" = "" ]; then + make -j$(nproc) +else + make "$@" +fi + +echo "Done." diff --git a/demos/TTSCppFrontend/build.sh b/demos/TTSCppFrontend/build.sh new file mode 100755 index 00000000..a136cb93 --- /dev/null +++ b/demos/TTSCppFrontend/build.sh @@ -0,0 +1,21 @@ +#!/bin/bash +set -e +set -x + +cd "$(dirname "$(realpath "$0")")" + +echo "************* Download & Build Dependencies *************" +./build-depends.sh "$@" + +echo "************* Build Front Lib and Demo *************" +mkdir -p ./build +cd ./build +cmake .. + +if [ "$*" = "" ]; then + make -j$(nproc) +else + make "$@" +fi + +echo "Done." diff --git a/demos/TTSCppFrontend/clean.sh b/demos/TTSCppFrontend/clean.sh new file mode 100755 index 00000000..efbb2887 --- /dev/null +++ b/demos/TTSCppFrontend/clean.sh @@ -0,0 +1,10 @@ +#!/bin/bash +set -e +set -x + +cd "$(dirname "$(realpath "$0")")" +rm -rf "./front_demo/dict" +rm -rf "./build" +rm -rf "./third-party/build" + +echo "Done." diff --git a/demos/TTSCppFrontend/download.sh b/demos/TTSCppFrontend/download.sh new file mode 100755 index 00000000..0953e3a5 --- /dev/null +++ b/demos/TTSCppFrontend/download.sh @@ -0,0 +1,62 @@ +#!/bin/bash +set -e + +cd "$(dirname "$(realpath "$0")")" + +download() { + file="$1" + url="$2" + md5="$3" + dir="$4" + + cd "$dir" + + if [ -f "$file" ] && [ "$(md5sum "$file" | awk '{ print $1 }')" = "$md5" ]; then + echo "File $file (MD5: $md5) has been downloaded." + else + echo "Downloading $file..." + wget -O "$file" "$url" + + # MD5 verify + fileMd5="$(md5sum "$file" | awk '{ print $1 }')" + if [ "$fileMd5" == "$md5" ]; then + echo "File $file (MD5: $md5) has been downloaded." + else + echo "MD5 mismatch, file may be corrupt" + echo "$file MD5: $fileMd5, it should be $md5" + fi + fi + + echo "Extracting $file..." 
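+    # GNU tar detects the .tar.gz compression automatically; -v lists each
+    # entry as it is unpacked into "$dir" (the function cd'd there above).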
+ echo '-----------------------' + tar -vxf "$file" + echo '=======================' +} + +######################################## + +DIST_DIR="$PWD/front_demo/dict" + +mkdir -p "$DIST_DIR" + +download 'fastspeech2_nosil_baker_ckpt_0.4.tar.gz' \ + 'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/fastspeech2_nosil_baker_ckpt_0.4.tar.gz' \ + '7bf1bab1737375fa123c413eb429c573' \ + "$DIST_DIR" + +download 'speedyspeech_nosil_baker_ckpt_0.5.tar.gz' \ + 'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/speedyspeech_nosil_baker_ckpt_0.5.tar.gz' \ + '0b7754b21f324789aef469c61f4d5b8f' \ + "$DIST_DIR" + +download 'jieba.tar.gz' \ + 'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/jieba.tar.gz' \ + '6d30f426bd8c0025110a483f051315ca' \ + "$DIST_DIR" + +download 'tranditional_to_simplified.tar.gz' \ + 'https://paddlespeech.bj.bcebos.com/t2s/text_frontend/tranditional_to_simplified.tar.gz' \ + '258f5b59d5ebfe96d02007ca1d274a7f' \ + "$DIST_DIR" + +echo "Done." diff --git a/demos/TTSCppFrontend/front_demo/front.conf b/demos/TTSCppFrontend/front_demo/front.conf new file mode 100644 index 00000000..abff4447 --- /dev/null +++ b/demos/TTSCppFrontend/front_demo/front.conf @@ -0,0 +1,21 @@ +# jieba conf +--jieba_dict_path=./front_demo/dict/jieba/jieba.dict.utf8 +--jieba_hmm_path=./front_demo/dict/jieba/hmm_model.utf8 +--jieba_user_dict_path=./front_demo/dict/jieba/user.dict.utf8 +--jieba_idf_path=./front_demo/dict/jieba/idf.utf8 +--jieba_stop_word_path=./front_demo/dict/jieba/stop_words.utf8 + +# dict conf fastspeech2_0.4 +--separate_tone=false +--word2phone_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict +--phone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt +--tone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict + +# dict conf speedyspeech_0.5 +#--separate_tone=true +#--word2phone_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict +#--phone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt +#--tone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt + +# dict of tranditional_to_simplified +--trand2simpd_path=./front_demo/dict/tranditional_to_simplified/trand2simp.txt diff --git a/demos/TTSCppFrontend/front_demo/front_demo.cpp b/demos/TTSCppFrontend/front_demo/front_demo.cpp new file mode 100644 index 00000000..77f3fc72 --- /dev/null +++ b/demos/TTSCppFrontend/front_demo/front_demo.cpp @@ -0,0 +1,79 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
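+//
+// front_demo: a standalone driver for the C++ text frontend. It converts one
+// UTF-8 sentence into phone-id (and, for speedyspeech-style models, tone-id)
+// sequences and logs them; no acoustic model or vocoder is involved.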
+
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+#include <cstdlib>
+#include <string>
+#include <vector>
+#include "front/front_interface.h"
+
+DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized");
+DEFINE_string(front_conf, "./front_demo/front.conf", "Front conf file");
+// DEFINE_string(separate_tone, "true", "If true, get phoneids and tonesid");
+
+
+int main(int argc, char** argv) {
+    gflags::ParseCommandLineFlags(&argc, &argv, true);
+    // Instantiate the text frontend engine
+    ppspeech::FrontEngineInterface* front_inst = nullptr;
+    front_inst = new ppspeech::FrontEngineInterface(FLAGS_front_conf);
+    if ((!front_inst) || (front_inst->init())) {
+        LOG(ERROR) << "Create TTS engine failed!";
+        if (front_inst != nullptr) {
+            delete front_inst;
+        }
+        front_inst = nullptr;
+        return -1;
+    }
+
+    std::wstring ws_sentence = ppspeech::utf8string2wstring(FLAGS_sentence);
+
+    // Convert traditional Chinese characters to simplified ones
+    std::wstring sentence_simp;
+    front_inst->Trand2Simp(ws_sentence, &sentence_simp);
+    ws_sentence = sentence_simp;
+
+    std::string s_sentence;
+    std::vector<std::wstring> sentence_part;
+    std::vector<int> phoneids = {};
+    std::vector<int> toneids = {};
+
+    // Split the sentence at punctuation marks
+    LOG(INFO) << "Start to segment sentences by punctuation";
+    front_inst->SplitByPunc(ws_sentence, &sentence_part);
+    LOG(INFO) << "Segment sentences through punctuation successfully";
+
+    // Get the phoneme ids of each sub-sentence
+    LOG(INFO)
+        << "Start to get the phoneme and tone id sequence of each sentence";
+    for (int i = 0; i < sentence_part.size(); i++) {
+        LOG(INFO) << "Raw sentence is: "
+                  << ppspeech::wstring2utf8string(sentence_part[i]);
+        front_inst->SentenceNormalize(&sentence_part[i]);
+        s_sentence = ppspeech::wstring2utf8string(sentence_part[i]);
+        LOG(INFO) << "After normalization sentence is: " << s_sentence;
+
+        if (0 != front_inst->GetSentenceIds(s_sentence, &phoneids, &toneids)) {
+            LOG(ERROR) << "TTS inst get sentence phoneids and toneids failed";
+            return -1;
+        }
+    }
+    LOG(INFO) << "The phoneids of the sentence are: "
+              << limonp::Join(phoneids.begin(), phoneids.end(), " ");
+    LOG(INFO) << "The toneids of the sentence are: "
+              << limonp::Join(toneids.begin(), toneids.end(), " ");
+    LOG(INFO) << "Get the phoneme id sequence of each sentence successfully";
+
+    return EXIT_SUCCESS;
+}
diff --git a/demos/TTSCppFrontend/front_demo/gentools/gen_dict_paddlespeech.py b/demos/TTSCppFrontend/front_demo/gentools/gen_dict_paddlespeech.py
new file mode 100644
index 00000000..5aaa6e34
--- /dev/null
+++ b/demos/TTSCppFrontend/front_demo/gentools/gen_dict_paddlespeech.py
@@ -0,0 +1,111 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
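+#
+# gen_dict_paddlespeech.py: uses the Python zh_frontend to pre-compute a
+# word -> phones dictionary (with tone digits appended for speedyspeech)
+# for every entry in the jieba word dict, so the C++ frontend can look
+# pronunciations up at runtime instead of re-deriving them.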
+import argparse +import configparser + +from paddlespeech.t2s.frontend.zh_frontend import Frontend + + +def get_phone(frontend, + word, + merge_sentences=True, + print_info=False, + robot=False, + get_tone_ids=False): + phonemes = frontend.get_phonemes(word, merge_sentences, print_info, robot) + # Some optimizations + phones, tones = frontend._get_phone_tone(phonemes[0], get_tone_ids) + #print(type(phones), phones) + #print(type(tones), tones) + return phones, tones + + +def gen_word2phone_dict(frontend, + jieba_words_dict, + word2phone_dict, + get_tone=False): + with open(jieba_words_dict, "r") as f1, open(word2phone_dict, "w+") as f2: + for line in f1.readlines(): + word = line.split(" ")[0] + phone, tone = get_phone(frontend, word, get_tone_ids=get_tone) + phone_str = "" + + if tone: + assert (len(phone) == len(tone)) + for i in range(len(tone)): + phone_tone = phone[i] + tone[i] + phone_str += (" " + phone_tone) + phone_str = phone_str.strip("sp0").strip(" ") + else: + for x in phone: + phone_str += (" " + x) + phone_str = phone_str.strip("sp").strip(" ") + print(phone_str) + f2.write(word + " " + phone_str + "\n") + print("Generate word2phone dict successfully.") + + +def main(): + parser = argparse.ArgumentParser(description="Generate dictionary") + parser.add_argument( + "--config", type=str, default="./config.ini", help="config file.") + parser.add_argument( + "--am_type", + type=str, + default="fastspeech2", + help="fastspeech2 or speedyspeech") + args = parser.parse_args() + + # Read config + cf = configparser.ConfigParser() + cf.read(args.config) + jieba_words_dict_file = cf.get("jieba", + "jieba_words_dict") # get words dict + + am_type = args.am_type + if (am_type == "fastspeech2"): + phone2id_dict_file = cf.get(am_type, "phone2id_dict") + word2phone_dict_file = cf.get(am_type, "word2phone_dict") + + frontend = Frontend(phone_vocab_path=phone2id_dict_file) + print("frontend done!") + + gen_word2phone_dict( + frontend, + jieba_words_dict_file, + word2phone_dict_file, + get_tone=False) + + elif (am_type == "speedyspeech"): + phone2id_dict_file = cf.get(am_type, "phone2id_dict") + tone2id_dict_file = cf.get(am_type, "tone2id_dict") + word2phone_dict_file = cf.get(am_type, "word2phone_dict") + + frontend = Frontend( + phone_vocab_path=phone2id_dict_file, + tone_vocab_path=tone2id_dict_file) + print("frontend done!") + + gen_word2phone_dict( + frontend, + jieba_words_dict_file, + word2phone_dict_file, + get_tone=True) + + else: + print("Please set correct am type, fastspeech2 or speedyspeech.") + + +if __name__ == "__main__": + main() diff --git a/demos/TTSCppFrontend/front_demo/gentools/genid.py b/demos/TTSCppFrontend/front_demo/gentools/genid.py new file mode 100644 index 00000000..cf83623f --- /dev/null +++ b/demos/TTSCppFrontend/front_demo/gentools/genid.py @@ -0,0 +1,35 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
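+#
+# genid.py: assigns consecutive integer ids to each phone and tone symbol,
+# starting from 2 because ids 0 and 1 are reserved for the two special
+# symbols written first.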
+ +PHONESFILE = "./dict/phones.txt" +PHONES_ID_FILE = "./dict/phonesid.dict" +TONESFILE = "./dict/tones.txt" +TONES_ID_FILE = "./dict/tonesid.dict" + + +def GenIdFile(file, idfile): + id = 2 + with open(file, 'r') as f1, open(idfile, "w+") as f2: + f2.write(" 0\n") + f2.write(" 1\n") + for line in f1.readlines(): + phone = line.strip() + print(phone + " " + str(id) + "\n") + f2.write(phone + " " + str(id) + "\n") + id += 1 + + +if __name__ == "__main__": + GenIdFile(PHONESFILE, PHONES_ID_FILE) + GenIdFile(TONESFILE, TONES_ID_FILE) diff --git a/demos/TTSCppFrontend/front_demo/gentools/word2phones.py b/demos/TTSCppFrontend/front_demo/gentools/word2phones.py new file mode 100644 index 00000000..d9baeea9 --- /dev/null +++ b/demos/TTSCppFrontend/front_demo/gentools/word2phones.py @@ -0,0 +1,55 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import re + +from pypinyin import lazy_pinyin +from pypinyin import Style + +worddict = "./dict/jieba_part.dict.utf8" +newdict = "./dict/word_phones.dict" + + +def GenPhones(initials, finals, separate=True): + + phones = [] + for c, v in zip(initials, finals): + if re.match(r'i\d', v): + if c in ['z', 'c', 's']: + v = re.sub('i', 'ii', v) + elif c in ['zh', 'ch', 'sh', 'r']: + v = re.sub('i', 'iii', v) + if c: + if separate is True: + phones.append(c + '0') + elif separate is False: + phones.append(c) + else: + print("Not sure whether phone and tone need to be separated") + if v: + phones.append(v) + return phones + + +with open(worddict, "r") as f1, open(newdict, "w+") as f2: + for line in f1.readlines(): + word = line.split(" ")[0] + initials = lazy_pinyin( + word, neutral_tone_with_five=True, style=Style.INITIALS) + finals = lazy_pinyin( + word, neutral_tone_with_five=True, style=Style.FINALS_TONE3) + + phones = GenPhones(initials, finals, True) + + temp = " ".join(phones) + f2.write(word + " " + temp + "\n") diff --git a/demos/TTSCppFrontend/run_front_demo.sh b/demos/TTSCppFrontend/run_front_demo.sh new file mode 100755 index 00000000..4dcded5c --- /dev/null +++ b/demos/TTSCppFrontend/run_front_demo.sh @@ -0,0 +1,7 @@ +#!/bin/bash +set -e +set -x + +cd "$(dirname "$(realpath "$0")")" + +./build/tts_front_demo "$@" diff --git a/demos/TTSCppFrontend/src/base/type_conv.cpp b/demos/TTSCppFrontend/src/base/type_conv.cpp new file mode 100644 index 00000000..b7ff6364 --- /dev/null +++ b/demos/TTSCppFrontend/src/base/type_conv.cpp @@ -0,0 +1,28 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "base/type_conv.h"
+
+namespace ppspeech {
+// wstring to string
+std::string wstring2utf8string(const std::wstring& str) {
+    static std::wstring_convert<std::codecvt_utf8<wchar_t>> strCnv;
+    return strCnv.to_bytes(str);
+}
+
+// string to wstring
+std::wstring utf8string2wstring(const std::string& str) {
+    static std::wstring_convert<std::codecvt_utf8<wchar_t>> strCnv;
+    return strCnv.from_bytes(str);
+}
+}  // namespace ppspeech
diff --git a/demos/TTSCppFrontend/src/base/type_conv.h b/demos/TTSCppFrontend/src/base/type_conv.h
new file mode 100644
index 00000000..6aecfc43
--- /dev/null
+++ b/demos/TTSCppFrontend/src/base/type_conv.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef BASE_TYPE_CONVC_H
+#define BASE_TYPE_CONVC_H
+
+#include <codecvt>
+#include <locale>
+#include <string>
+
+
+namespace ppspeech {
+// wstring to string
+std::string wstring2utf8string(const std::wstring& str);
+
+// string to wstring
+std::wstring utf8string2wstring(const std::string& str);
+}  // namespace ppspeech
+
+#endif  // BASE_TYPE_CONVC_H
\ No newline at end of file
diff --git a/demos/TTSCppFrontend/src/front/front_interface.cpp b/demos/TTSCppFrontend/src/front/front_interface.cpp
new file mode 100644
index 00000000..e7b08c79
--- /dev/null
+++ b/demos/TTSCppFrontend/src/front/front_interface.cpp
@@ -0,0 +1,1130 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
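+//
+// front_interface.cpp: the Chinese text-to-phoneme pipeline. init() loads
+// jieba plus the word/phone/tone dictionaries; GetSentenceIds() then runs
+// word segmentation, the Merge* passes (不 / 一 / reduplication / third-tone
+// / 儿), tone-sandhi modification, and erhua handling before mapping each
+// phone to its integer id.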
+#include "front/front_interface.h" + +namespace ppspeech { + +int FrontEngineInterface::init() { + if (_initialed) { + return 0; + } + if (0 != ReadConfFile()) { + LOG(ERROR) << "Read front conf file failed"; + return -1; + } + + _jieba = new cppjieba::Jieba(_jieba_dict_path, + _jieba_hmm_path, + _jieba_user_dict_path, + _jieba_idf_path, + _jieba_stop_word_path); + + _punc = {",", + "。", + "、", + "?", + ":", + ";", + "~", + "!", + ",", + ".", + "?", + "!", + ":", + ";", + "/", + "\\"}; + _punc_omit = {"“", "”", "\"", "\""}; + + // 需要儿化音处理的词语 + must_erhua = { + "小院儿", "胡同儿", "范儿", "老汉儿", "撒欢儿", "寻老礼儿", "妥妥儿"}; + not_erhua = {"虐儿", "为儿", "护儿", "瞒儿", "救儿", "替儿", + "有儿", "一儿", "我儿", "俺儿", "妻儿", "拐儿", + "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", + "婴幼儿", "连体儿", "脑瘫儿", "流浪儿", "体弱儿", "混血儿", + "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", + "侄儿", "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", + "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿", + "狗儿"}; + + must_not_neural_tone_words = { + "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子"}; + // 需要轻声处理的词语 + must_neural_tone_words = { + "麻烦", "麻利", "鸳鸯", "高粱", "骨头", "骆驼", "马虎", "首饰", "馒头", + "馄饨", "风筝", "难为", "队伍", "阔气", "闺女", "门道", "锄头", "铺盖", + "铃铛", "铁匠", "钥匙", "里脊", "里头", "部分", "那么", "道士", "造化", + "迷糊", "连累", "这么", "这个", "运气", "过去", "软和", "转悠", "踏实", + "跳蚤", "跟头", "趔趄", "财主", "豆腐", "讲究", "记性", "记号", "认识", + "规矩", "见识", "裁缝", "补丁", "衣裳", "衣服", "衙门", "街坊", "行李", + "行当", "蛤蟆", "蘑菇", "薄荷", "葫芦", "葡萄", "萝卜", "荸荠", "苗条", + "苗头", "苍蝇", "芝麻", "舒服", "舒坦", "舌头", "自在", "膏药", "脾气", + "脑袋", "脊梁", "能耐", "胳膊", "胭脂", "胡萝", "胡琴", "胡同", "聪明", + "耽误", "耽搁", "耷拉", "耳朵", "老爷", "老实", "老婆", "老头", "老太", + "翻腾", "罗嗦", "罐头", "编辑", "结实", "红火", "累赘", "糨糊", "糊涂", + "精神", "粮食", "簸箕", "篱笆", "算计", "算盘", "答应", "笤帚", "笑语", + "笑话", "窟窿", "窝囊", "窗户", "稳当", "稀罕", "称呼", "秧歌", "秀气", + "秀才", "福气", "祖宗", "砚台", "码头", "石榴", "石头", "石匠", "知识", + "眼睛", "眯缝", "眨巴", "眉毛", "相声", "盘算", "白净", "痢疾", "痛快", + "疟疾", "疙瘩", "疏忽", "畜生", "生意", "甘蔗", "琵琶", "琢磨", "琉璃", + "玻璃", "玫瑰", "玄乎", "狐狸", "状元", "特务", "牲口", "牙碜", "牌楼", + "爽快", "爱人", "热闹", "烧饼", "烟筒", "烂糊", "点心", "炊帚", "灯笼", + "火候", "漂亮", "滑溜", "溜达", "温和", "清楚", "消息", "浪头", "活泼", + "比方", "正经", "欺负", "模糊", "槟榔", "棺材", "棒槌", "棉花", "核桃", + "栅栏", "柴火", "架势", "枕头", "枇杷", "机灵", "本事", "木头", "木匠", + "朋友", "月饼", "月亮", "暖和", "明白", "时候", "新鲜", "故事", "收拾", + "收成", "提防", "挖苦", "挑剔", "指甲", "指头", "拾掇", "拳头", "拨弄", + "招牌", "招呼", "抬举", "护士", "折腾", "扫帚", "打量", "打算", "打点", + "打扮", "打听", "打发", "扎实", "扁担", "戒指", "懒得", "意识", "意思", + "情形", "悟性", "怪物", "思量", "怎么", "念头", "念叨", "快活", "忙活", + "志气", "心思", "得罪", "张罗", "弟兄", "开通", "应酬", "庄稼", "干事", + "帮手", "帐篷", "希罕", "师父", "师傅", "巴结", "巴掌", "差事", "工夫", + "岁数", "屁股", "尾巴", "少爷", "小气", "小伙", "将就", "对头", "对付", + "寡妇", "家伙", "客气", "实在", "官司", "学问", "学生", "字号", "嫁妆", + "媳妇", "媒人", "婆家", "娘家", "委屈", "姑娘", "姐夫", "妯娌", "妥当", + "妖精", "奴才", "女婿", "头发", "太阳", "大爷", "大方", "大意", "大夫", + "多少", "多么", "外甥", "壮实", "地道", "地方", "在乎", "困难", "嘴巴", + "嘱咐", "嘟囔", "嘀咕", "喜欢", "喇嘛", "喇叭", "商量", "唾沫", "哑巴", + "哈欠", "哆嗦", "咳嗽", "和尚", "告诉", "告示", "含糊", "吓唬", "后头", + "名字", "名堂", "合同", "吆喝", "叫唤", "口袋", "厚道", "厉害", "千斤", + "包袱", "包涵", "匀称", "勤快", "动静", "动弹", "功夫", "力气", "前头", + "刺猬", "刺激", "别扭", "利落", "利索", "利害", "分析", "出息", "凑合", + "凉快", "冷战", "冤枉", "冒失", "养活", "关系", "先生", "兄弟", "便宜", + "使唤", "佩服", "作坊", "体面", "位置", "似的", "伙计", "休息", "什么", + "人家", "亲戚", "亲家", "交情", "云彩", "事情", "买卖", "主意", "丫头", + "丧气", "两口", "东西", "东家", "世故", "不由", "不在", "下水", "下巴", + "上头", "上司", "丈夫", "丈人", "一辈", "那个", "菩萨", "父亲", "母亲", + "咕噜", "邋遢", "费用", "冤家", "甜头", "介绍", "荒唐", "大人", "泥鳅", + "幸福", "熟悉", "计划", "扑腾", "蜡烛", "姥爷", 
"照顾", "喉咙", "吉他", + "弄堂", "蚂蚱", "凤凰", "拖沓", "寒碜", "糟蹋", "倒腾", "报复", "逻辑", + "盘缠", "喽啰", "牢骚", "咖喱", "扫把", "惦记"}; + + + // 生成词典(词到音素的映射) + if (0 != GenDict(_word2phone_path, &word_phone_map)) { + LOG(ERROR) << "Genarate word2phone dict failed"; + return -1; + } + + // 生成音素字典(音素到音素id的映射) + if (0 != GenDict(_phone2id_path, &phone_id_map)) { + LOG(ERROR) << "Genarate phone2id dict failed"; + return -1; + } + + // 生成音调字典(音调到音调id的映射) + if (_separate_tone == "true") { + if (0 != GenDict(_tone2id_path, &tone_id_map)) { + LOG(ERROR) << "Genarate tone2id dict failed"; + return -1; + } + } + + // 生成繁简字典(繁体到简体id的映射) + if (0 != GenDict(_trand2simp_path, &trand_simp_map)) { + LOG(ERROR) << "Genarate trand2simp dict failed"; + return -1; + } + + _initialed = true; + return 0; +} + +int FrontEngineInterface::ReadConfFile() { + std::ifstream is(_conf_file.c_str(), std::ifstream::in); + if (!is.good()) { + LOG(ERROR) << "Cannot open config file: " << _conf_file; + return -1; + } + std::string line, key, value; + while (std::getline(is, line)) { + if (line.substr(0, 2) == "--") { + size_t pos = line.find_first_of("=", 0); + std::string key = line.substr(2, pos - 2); + std::string value = line.substr(pos + 1); + conf_map[key] = value; + LOG(INFO) << "Key: " << key << "; Value: " << value; + } + } + + // jieba conf path + _jieba_dict_path = conf_map["jieba_dict_path"]; + _jieba_hmm_path = conf_map["jieba_hmm_path"]; + _jieba_user_dict_path = conf_map["jieba_user_dict_path"]; + _jieba_idf_path = conf_map["jieba_idf_path"]; + _jieba_stop_word_path = conf_map["jieba_stop_word_path"]; + + // dict path + _separate_tone = conf_map["separate_tone"]; + _word2phone_path = conf_map["word2phone_path"]; + _phone2id_path = conf_map["phone2id_path"]; + _tone2id_path = conf_map["tone2id_path"]; + _trand2simp_path = conf_map["trand2simpd_path"]; + + return 0; +} + +int FrontEngineInterface::Trand2Simp(const std::wstring &sentence, + std::wstring *sentence_simp) { + // sentence_simp = sentence; + for (int i = 0; i < sentence.length(); i++) { + std::wstring temp(1, sentence[i]); + std::string sigle_word = ppspeech::wstring2utf8string(temp); + // 单个字是否在繁转简的字典里 + if (trand_simp_map.find(sigle_word) == trand_simp_map.end()) { + sentence_simp->append(temp); + } else { + sentence_simp->append( + (ppspeech::utf8string2wstring(trand_simp_map[sigle_word]))); + } + } + + return 0; +} + +int FrontEngineInterface::GenDict(const std::string &dict_file, + std::map *map) { + std::ifstream is(dict_file.c_str(), std::ifstream::in); + if (!is.good()) { + LOG(ERROR) << "Cannot open dict file: " << dict_file; + return -1; + } + std::string line, key, value; + while (std::getline(is, line)) { + size_t pos = line.find_first_of(" ", 0); + key = line.substr(0, pos); + value = line.substr(pos + 1); + (*map)[key] = value; + } + return 0; +} + +int FrontEngineInterface::GetSegResult( + std::vector> *seg, + std::vector *seg_words) { + std::vector>::iterator iter; + for (iter = seg->begin(); iter != seg->end(); iter++) { + seg_words->push_back((*iter).first); + } + return 0; +} + +int FrontEngineInterface::GetSentenceIds(const std::string &sentence, + std::vector *phoneids, + std::vector *toneids) { + std::vector> + cut_result; //分词结果包含词和词性 + if (0 != Cut(sentence, &cut_result)) { + LOG(ERROR) << "Cut sentence: \"" << sentence << "\" failed"; + return -1; + } + + if (0 != GetWordsIds(cut_result, phoneids, toneids)) { + LOG(ERROR) << "Get words phoneids failed"; + return -1; + } + return 0; +} + +int FrontEngineInterface::GetWordsIds( + const std::vector> 
&cut_result, + std::vector *phoneids, + std::vector *toneids) { + std::string word; + std::string pos; + std::vector word_initials; + std::vector word_finals; + std::string phone; + for (int i = 0; i < cut_result.size(); i++) { + word = cut_result[i].first; + pos = cut_result[i].second; + if (std::find(_punc_omit.begin(), _punc_omit.end(), word) == + _punc_omit.end()) { // 非可忽略的标点 + word_initials = {}; + word_finals = {}; + phone = ""; + // 判断是否在标点符号集合中 + if (std::find(_punc.begin(), _punc.end(), word) == + _punc.end()) { // 文字 + // 获取字词的声母韵母列表 + if (0 != + GetInitialsFinals(word, &word_initials, &word_finals)) { + LOG(ERROR) + << "Genarate the word_initials and word_finals of " + << word << " failed"; + return -1; + } + + // 对读音进行修改 + if (0 != ModifyTone(word, pos, &word_finals)) { + LOG(ERROR) << "Failed to modify tone."; + } + + // 对儿化音进行修改 + std::vector> new_initals_finals = + MergeErhua(word_initials, word_finals, word, pos); + word_initials = new_initals_finals[0]; + word_finals = new_initals_finals[1]; + + // 将声母和韵母合并成音素 + assert(word_initials.size() == word_finals.size()); + std::string temp_phone; + for (int j = 0; j < word_initials.size(); j++) { + if (word_initials[j] != "") { + temp_phone = word_initials[j] + " " + word_finals[j]; + } else { + temp_phone = word_finals[j]; + } + if (j == 0) { + phone += temp_phone; + } else { + phone += (" " + temp_phone); + } + } + } else { // 标点符号 + if (_separate_tone == "true") { + phone = "sp0"; // speedyspeech + } else { + phone = "sp"; // fastspeech2 + } + } + + // 音素到音素id + if (0 != Phone2Phoneid(phone, phoneids, toneids)) { + LOG(ERROR) << "Genarate the phone id of " << word << " failed"; + return -1; + } + } + } + return 0; +} + +int FrontEngineInterface::Cut( + const std::string &sentence, + std::vector> *cut_result) { + std::vector> cut_result_jieba; + + // 结巴分词 + _jieba->Tag(sentence, cut_result_jieba); + + // 对分词后结果进行整合 + if (0 != MergeforModify(&cut_result_jieba, cut_result)) { + LOG(ERROR) << "Failed to modify for word segmentation result."; + return -1; + } + + return 0; +} + +int FrontEngineInterface::GetPhone(const std::string &word, + std::string *phone) { + // 判断 word 在不在 词典里,如果不在,进行CutAll分词 + if (word_phone_map.find(word) == word_phone_map.end()) { + std::vector wordcut; + _jieba->CutAll(word, wordcut); + phone->assign(word_phone_map[wordcut[0]]); + for (int i = 1; i < wordcut.size(); i++) { + phone->assign((*phone) + (" " + word_phone_map[wordcut[i]])); + } + } else { + phone->assign(word_phone_map[word]); + } + + return 0; +} + +int FrontEngineInterface::Phone2Phoneid(const std::string &phone, + std::vector *phoneid, + std::vector *toneid) { + std::vector phone_vec; + phone_vec = absl::StrSplit(phone, " "); + std::string temp_phone; + for (int i = 0; i < phone_vec.size(); i++) { + temp_phone = phone_vec[i]; + if (_separate_tone == "true") { + phoneid->push_back(atoi( + (phone_id_map[temp_phone.substr(0, temp_phone.length() - 1)]) + .c_str())); + toneid->push_back( + atoi((tone_id_map[temp_phone.substr(temp_phone.length() - 1, + temp_phone.length())]) + .c_str())); + } else { + phoneid->push_back(atoi((phone_id_map[temp_phone]).c_str())); + } + } + return 0; +} + + +// 根据韵母判断该词中每个字的读音都为第三声。true表示词中每个字都是第三声 +bool FrontEngineInterface::AllToneThree( + const std::vector &finals) { + bool flags = true; + for (int i = 0; i < finals.size(); i++) { + if (static_cast(finals[i].back()) != 51) { //如果读音不为第三声 + flags = false; + } + } + return flags; +} + +// 判断词是否是叠词 +bool FrontEngineInterface::IsReduplication(const std::string &word) 
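+// (Returns true only for two-character AA words such as 看看 or 妈妈; longer
+// reduplications are not matched here.)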
{ + bool flags = false; + std::wstring word_wstr = ppspeech::utf8string2wstring(word); + int len = word_wstr.length(); + if (len == 2 && word_wstr[0] == word_wstr[1]) { + flags = true; + } + return flags; +} + +// 获取每个字词的声母和韵母列表, word_initials 为声母列表,word_finals +// 为韵母列表 +int FrontEngineInterface::GetInitialsFinals( + const std::string &word, + std::vector *word_initials, + std::vector *word_finals) { + std::string phone; + GetPhone(word, &phone); //获取字词对应的音素 + std::vector phone_vec = absl::StrSplit(phone, " "); + //获取韵母,每个字的音素有1或者2个,start为单个字音素的起始位置。 + int start = 0; + while (start < phone_vec.size()) { + if (phone_vec[start] == "sp" || phone_vec[start] == "sp0") { + start += 1; + } else if (isdigit(phone_vec[start].back()) == 0 || + static_cast(phone_vec[start].back()) == 48) { + word_initials->push_back(phone_vec[start]); + word_finals->push_back(phone_vec[start + 1]); + start += 2; + } else { + word_initials->push_back(""); + word_finals->push_back(phone_vec[start]); + start += 1; + } + } + + assert(word_finals->size() == ppspeech::utf8string2wstring(word).length() && + word_finals->size() == word_initials->size()); + + return 0; +} + +// 获取每个字词的韵母列表 +int FrontEngineInterface::GetFinals(const std::string &word, + std::vector *word_finals) { + std::vector word_initials; + if (0 != GetInitialsFinals(word, &word_initials, word_finals)) { + LOG(ERROR) << "Failed to get word finals"; + return -1; + } + + return 0; +} + +int FrontEngineInterface::Word2WordVec(const std::string &word, + std::vector *wordvec) { + std::wstring word_wstr = ppspeech::utf8string2wstring(word); + for (int i = 0; i < word_wstr.length(); i++) { + std::wstring word_sigle(1, word_wstr[i]); + wordvec->push_back(word_sigle); + } + return 0; +} + +// yuantian01解释:把一个词再进行分词找到。例子:小雨伞 --> 小 雨伞 或者 小雨 伞 +int FrontEngineInterface::SplitWord(const std::string &word, + std::vector *new_word_vec) { + std::vector word_vec; + std::string second_subword; + _jieba->CutForSearch(word, word_vec); + // 升序 + std::sort(word_vec.begin(), + word_vec.end(), + [](std::string a, std::string b) { return a.size() > b.size(); }); + std::string first_subword = word_vec[0]; // 提取长度最短的字符串 + int first_begin_idx = word.find_first_of(first_subword); + if (first_begin_idx == 0) { + second_subword = word.substr(first_subword.length()); + new_word_vec->push_back(first_subword); + new_word_vec->push_back(second_subword); + } else { + second_subword = word.substr(0, word.length() - first_subword.length()); + new_word_vec->push_back(second_subword); + new_word_vec->push_back(first_subword); + } + + return 0; +} + + +// example: 不 一起 --> 不一起 +std::vector> FrontEngineInterface::MergeBu( + std::vector> *seg_result) { + std::vector> result; + std::string word; + std::string pos; + std::string last_word = ""; + + for (int i = 0; i < seg_result->size(); i++) { + word = std::get<0>((*seg_result)[i]); + pos = std::get<1>((*seg_result)[i]); + if (last_word == "不") { + word = last_word + word; + } + if (word != "不") { + result.push_back(make_pair(word, pos)); + } + last_word = word; + } + + if (last_word == "不") { + result.push_back(make_pair(last_word, "d")); + last_word = ""; + } + + return result; +} + +std::vector> FrontEngineInterface::Mergeyi( + std::vector> *seg_result) { + std::vector> *result_temp = + new std::vector>(); + std::string word; + std::string pos; + // function 1 example: 听 一 听 --> 听一听 + for (int i = 0; i < seg_result->size(); i++) { + word = std::get<0>((*seg_result)[i]); + pos = std::get<1>((*seg_result)[i]); + + if ((i - 1 >= 0) && (word == "一") 
&& (i + 1 < seg_result->size()) && + (std::get<0>((*seg_result)[i - 1]) == + std::get<0>((*seg_result)[i + 1])) && + std::get<1>((*seg_result)[i - 1]) == "v") { + std::get<0>((*result_temp)[i - 1]) = + std::get<0>((*result_temp)[i - 1]) + "一" + + std::get<0>((*result_temp)[i - 1]); + } else { + if ((i - 2 >= 0) && (std::get<0>((*seg_result)[i - 1]) == "一") && + (std::get<0>((*seg_result)[i - 2]) == word) && (pos == "v")) { + continue; + } else { + result_temp->push_back(make_pair(word, pos)); + } + } + } + + // function 2 example: 一 你 --> 一你 + std::vector> result = {}; + for (int j = 0; j < result_temp->size(); j++) { + word = std::get<0>((*result_temp)[j]); + pos = std::get<1>((*result_temp)[j]); + if ((result.size() != 0) && (result.back().first == "一")) { + result.back().first = result.back().first + word; + } else { + result.push_back(make_pair(word, pos)); + } + } + + return result; +} + +// example: 你 你 --> 你你 +std::vector> +FrontEngineInterface::MergeReduplication( + std::vector> *seg_result) { + std::vector> result; + std::string word; + std::string pos; + + for (int i = 0; i < seg_result->size(); i++) { + word = std::get<0>((*seg_result)[i]); + pos = std::get<1>((*seg_result)[i]); + if ((result.size() != 0) && (word == result.back().first)) { + result.back().first = + result.back().first + std::get<0>((*seg_result)[i]); + } else { + result.push_back(make_pair(word, pos)); + } + } + + return result; +} + +// the first and the second words are all_tone_three +std::vector> +FrontEngineInterface::MergeThreeTones( + std::vector> *seg_result) { + std::vector> result; + std::string word; + std::string pos; + std::vector> finals; //韵母数组 + std::vector word_final; + std::vector merge_last(seg_result->size(), false); + + // 判断最后一个分词结果是不是标点,不看标点的声母韵母 + int word_num = seg_result->size() - 1; + + // seg_result[word_num].first + if (std::find( + _punc.begin(), _punc.end(), std::get<0>((*seg_result)[word_num])) == + _punc.end()) { // 最后一个分词结果不是标点 + word_num += 1; + } + + // 获取韵母数组 + for (int i = 0; i < word_num; i++) { + word_final = {}; + word = std::get<0>((*seg_result)[i]); + pos = std::get<1>((*seg_result)[i]); + if (std::find(_punc_omit.begin(), _punc_omit.end(), word) == + _punc_omit.end()) { // 非可忽略的标点,即文字 + if (0 != GetFinals(word, &word_final)) { + LOG(ERROR) << "Failed to get the final of word."; + } + } + + finals.push_back(word_final); + } + assert(word_num == finals.size()); + + // 对第三声读音的字词分词结果进行处理 + for (int i = 0; i < word_num; i++) { + word = std::get<0>((*seg_result)[i]); + pos = std::get<1>((*seg_result)[i]); + if (i - 1 >= 0 && AllToneThree(finals[i - 1]) && + AllToneThree(finals[i]) && !merge_last[i - 1]) { + // if the last word is reduplication, not merge, because + // reduplication need to be _neural_sandhi + // seg_result[i - 1].first + if (!IsReduplication(std::get<0>((*seg_result)[i - 1])) && + (ppspeech::utf8string2wstring( + std::get<0>((*seg_result)[i - 1]))) + .length() + + (ppspeech::utf8string2wstring(word)).length() <= + 3) { + result.back().first = + result.back().first + std::get<0>((*seg_result)[i]); + merge_last[i] = true; + } else { + result.push_back(make_pair(word, pos)); + } + } else { + result.push_back(make_pair(word, pos)); + } + } + + //把标点的分词结果补上 + if (word_num < seg_result->size()) { + result.push_back( + // seg_result[word_num].first seg_result[word_num].second + // std::get<0>((*seg_result)[word_num]) + make_pair(std::get<0>((*seg_result)[word_num]), + std::get<1>((*seg_result)[word_num]))); + } + + return result; +} + +// the last char of first 
word and the first char of second word is tone_three +std::vector> +FrontEngineInterface::MergeThreeTones2( + std::vector> *seg_result) { + std::vector> result; + std::string word; + std::string pos; + std::vector> finals; //韵母数组 + std::vector word_final; + std::vector merge_last(seg_result->size(), false); + + // 判断最后一个分词结果是不是标点 + int word_num = seg_result->size() - 1; + if (std::find( + _punc.begin(), _punc.end(), std::get<0>((*seg_result)[word_num])) == + _punc.end()) { // 最后一个分词结果不是标点 + word_num += 1; + } + + // 获取韵母数组 + for (int i = 0; i < word_num; i++) { + word_final = {}; + word = std::get<0>((*seg_result)[i]); + pos = std::get<1>((*seg_result)[i]); + // 如果是文字,则获取韵母,如果是可忽略的标点,例如引号,则跳过 + if (std::find(_punc_omit.begin(), _punc_omit.end(), word) == + _punc_omit.end()) { + if (0 != GetFinals(word, &word_final)) { + LOG(ERROR) << "Failed to get the final of word."; + } + } + + finals.push_back(word_final); + } + assert(word_num == finals.size()); + + // 对第三声读音的字词分词结果进行处理 + for (int i = 0; i < word_num; i++) { + word = std::get<0>((*seg_result)[i]); + pos = std::get<1>((*seg_result)[i]); + if (i - 1 >= 0 && !finals[i - 1].empty() && + absl::EndsWith(finals[i - 1].back(), "3") == true && + !finals[i].empty() && + absl::EndsWith(finals[i].front(), "3") == true && + !merge_last[i - 1]) { + // if the last word is reduplication, not merge, because + // reduplication need to be _neural_sandhi + // seg_result[i - 1].first + if (!IsReduplication(std::get<0>((*seg_result)[i - 1])) && + (ppspeech::utf8string2wstring( + std::get<0>((*seg_result)[i - 1]))) + .length() + + ppspeech::utf8string2wstring(word).length() <= + 3) { + result.back().first = + result.back().first + std::get<0>((*seg_result)[i]); + merge_last[i] = true; + } else { + result.push_back(make_pair(word, pos)); + } + } else { + result.push_back(make_pair(word, pos)); + } + } + + //把标点的分词结果补上 + if (word_num < seg_result->size()) { + result.push_back(make_pair(std::get<0>((*seg_result)[word_num]), + std::get<1>((*seg_result)[word_num]))); + } + + return result; +} + +// example: 吃饭 儿 --> 吃饭儿 +std::vector> FrontEngineInterface::MergeEr( + std::vector> *seg_result) { + std::vector> result; + std::string word; + std::string pos; + + for (int i = 0; i < seg_result->size(); i++) { + word = std::get<0>((*seg_result)[i]); + pos = std::get<1>((*seg_result)[i]); + if ((i - 1 >= 0) && (word == "儿")) { + result.back().first = + result.back().first + std::get<0>((*seg_result)[i]); + } else { + result.push_back(make_pair(word, pos)); + } + } + + return result; +} + +int FrontEngineInterface::MergeforModify( + std::vector> *seg_word_type, + std::vector> *modify_seg_word_type) { + std::vector seg_result; + GetSegResult(seg_word_type, &seg_result); + LOG(INFO) << "Before merge, seg result is: " + << limonp::Join(seg_result.begin(), seg_result.end(), "/"); + std::vector> tmp; + tmp = MergeBu(seg_word_type); + *modify_seg_word_type = tmp; + tmp = Mergeyi(modify_seg_word_type); + *modify_seg_word_type = tmp; + tmp = MergeReduplication(modify_seg_word_type); + *modify_seg_word_type = tmp; + tmp = MergeThreeTones(modify_seg_word_type); + *modify_seg_word_type = tmp; + tmp = MergeThreeTones2(modify_seg_word_type); + *modify_seg_word_type = tmp; + tmp = MergeEr(modify_seg_word_type); + *modify_seg_word_type = tmp; + seg_result = {}; + + GetSegResult(modify_seg_word_type, &seg_result); + LOG(INFO) << "After merge, seg result is: " + << limonp::Join(seg_result.begin(), seg_result.end(), "/"); + + return 0; +} + + +int FrontEngineInterface::BuSandi(const 
std::string &word, + std::vector *finals) { + std::wstring bu = L"不"; + std::vector wordvec; + // 一个词转成向量形式 + if (0 != Word2WordVec(word, &wordvec)) { + LOG(ERROR) << "Failed to get word vector"; + return -1; + } + + // e.g. 看不懂 b u4 --> b u5, 将韵母的最后一位替换成 5 + if (wordvec.size() == 3 && wordvec[1] == bu) { + (*finals)[1] = (*finals)[1].replace((*finals)[1].length() - 1, 1, "5"); + } else { + // e.g. 不怕 b u4 --> b u2, 将韵母的最后一位替换成 2 + for (int i = 0; i < wordvec.size(); i++) { + if (wordvec[i] == bu && i + 1 < wordvec.size() && + absl::EndsWith((*finals)[i + 1], "4") == true) { + (*finals)[i] = + (*finals)[i].replace((*finals)[i].length() - 1, 1, "2"); + } + } + } + + return 0; +} + + +int FrontEngineInterface::YiSandhi(const std::string &word, + std::vector *finals) { + std::wstring yi = L"一"; + std::vector wordvec; + // 一个词转成向量形式 + if (0 != Word2WordVec(word, &wordvec)) { + LOG(ERROR) << "Failed to get word vector"; + return -1; + } + + //情况1:"一" in number sequences, e.g. 一零零, 二一零 + std::wstring num_wstr = L"零一二三四六七八九"; + std::wstring word_wstr = ppspeech::utf8string2wstring(word); + if (word_wstr.find(yi) != word_wstr.npos && wordvec.back() != yi) { + int flags = 0; + for (int j = 0; j < wordvec.size(); j++) { + if (num_wstr.find(wordvec[j]) == num_wstr.npos) { + flags = -1; + break; + } + } + if (flags == 0) { + return 0; + } + } else if (wordvec.size() == 3 && wordvec[1] == yi && + wordvec[0] == wordvec[2]) { + // "一" between reduplication words shold be yi5, e.g. 看一看 + (*finals)[1] = (*finals)[1].replace((*finals)[1].length() - 1, 1, "5"); + } else if (wordvec[0] == L"第" && wordvec[1] == yi) { //以第一位开始 + (*finals)[1] = (*finals)[1].replace((*finals)[1].length() - 1, 1, "1"); + } else { + for (int i = 0; i < wordvec.size(); i++) { + if (wordvec[i] == yi && i + 1 < wordvec.size()) { + if (absl::EndsWith((*finals)[i + 1], "4") == true) { + // "一" before tone4 should be yi2, e.g. 一段 + (*finals)[i] = + (*finals)[i].replace((*finals)[i].length() - 1, 1, "2"); + } else { + // "一" before non-tone4 should be yi4, e.g. 一天 + (*finals)[i] = + (*finals)[i].replace((*finals)[i].length() - 1, 1, "4"); + } + } + } + } + + return 0; +} + +int FrontEngineInterface::NeuralSandhi(const std::string &word, + const std::string &pos, + std::vector *finals) { + std::wstring word_wstr = ppspeech::utf8string2wstring(word); + std::vector wordvec; + // 一个词转成向量形式 + if (0 != Word2WordVec(word, &wordvec)) { + LOG(ERROR) << "Failed to get word vector"; + return -1; + } + int word_num = wordvec.size(); + assert(word_num == word_wstr.length()); + + // 情况1:reduplication words for n. and v. e.g. 
奶奶, 试试, 旺旺 + for (int j = 0; j < wordvec.size(); j++) { + std::string inits = "nva"; + if (j - 1 >= 0 && wordvec[j] == wordvec[j - 1] && + inits.find(pos[0]) != inits.npos) { + (*finals)[j] = + (*finals)[j].replace((*finals)[j].length() - 1, 1, "5"); + } + } + + // 情况2:对下述词的处理 + std::wstring yuqici = L"吧呢哈啊呐噻嘛吖嗨呐哦哒额滴哩哟喽啰耶喔诶"; + std::wstring de = L"的地得"; + std::wstring le = L"了着过"; + std::vector le_pos = {"ul", "uz", "ug"}; + std::wstring men = L"们子"; + std::vector men_pos = {"r", "n"}; + std::wstring weizhi = L"上下里"; + std::vector weizhi_pos = {"s", "l", "f"}; + std::wstring dong = L"来去"; + std::wstring fangxiang = L"上下进出回过起开"; + std::wstring ge = L"个"; + std::wstring xiushi = L"几有两半多各整每做是零一二三四六七八九"; + auto ge_idx = word_wstr.find_first_of(ge); // 出现“个”的第一个位置 + + if (word_num >= 1 && yuqici.find(wordvec.back()) != yuqici.npos) { + (*finals).back() = + (*finals).back().replace((*finals).back().length() - 1, 1, "5"); + } else if (word_num >= 1 && de.find(wordvec.back()) != de.npos) { + (*finals).back() = + (*finals).back().replace((*finals).back().length() - 1, 1, "5"); + } else if (word_num == 1 && le.find(wordvec[0]) != le.npos && + find(le_pos.begin(), le_pos.end(), pos) != le_pos.end()) { + (*finals).back() = + (*finals).back().replace((*finals).back().length() - 1, 1, "5"); + } else if (word_num > 1 && men.find(wordvec.back()) != men.npos && + find(men_pos.begin(), men_pos.end(), pos) != men_pos.end() && + find(must_not_neural_tone_words.begin(), + must_not_neural_tone_words.end(), + word) != must_not_neural_tone_words.end()) { + (*finals).back() = + (*finals).back().replace((*finals).back().length() - 1, 1, "5"); + } else if (word_num > 1 && weizhi.find(wordvec.back()) != weizhi.npos && + find(weizhi_pos.begin(), weizhi_pos.end(), pos) != + weizhi_pos.end()) { + (*finals).back() = + (*finals).back().replace((*finals).back().length() - 1, 1, "5"); + } else if (word_num > 1 && dong.find(wordvec.back()) != dong.npos && + fangxiang.find(wordvec[word_num - 2]) != fangxiang.npos) { + (*finals).back() = + (*finals).back().replace((*finals).back().length() - 1, 1, "5"); + } else if ((ge_idx != word_wstr.npos && ge_idx >= 1 && + xiushi.find(wordvec[ge_idx - 1]) != xiushi.npos) || + word_wstr == ge) { + (*finals).back() = + (*finals).back().replace((*finals).back().length() - 1, 1, "5"); + } else { + if (find(must_neural_tone_words.begin(), + must_neural_tone_words.end(), + word) != must_neural_tone_words.end() || + (word_num >= 2 && + find(must_neural_tone_words.begin(), + must_neural_tone_words.end(), + ppspeech::wstring2utf8string(word_wstr.substr( + word_num - 2))) != must_neural_tone_words.end())) { + (*finals).back() = + (*finals).back().replace((*finals).back().length() - 1, 1, "5"); + } + } + + // 进行进一步分词,把长词切分更短些 + std::vector word_list; + if (0 != SplitWord(word, &word_list)) { + LOG(ERROR) << "Failed to split word."; + return -1; + } + // 创建对应的 韵母列表 + std::vector> finals_list; + std::vector finals_temp; + finals_temp.assign((*finals).begin(), + (*finals).begin() + + ppspeech::utf8string2wstring(word_list[0]).length()); + finals_list.push_back(finals_temp); + finals_temp.assign( + (*finals).begin() + ppspeech::utf8string2wstring(word_list[0]).length(), + (*finals).end()); + finals_list.push_back(finals_temp); + + finals = new std::vector(); + for (int i = 0; i < word_list.size(); i++) { + std::wstring temp_wstr = ppspeech::utf8string2wstring(word_list[i]); + if ((find(must_neural_tone_words.begin(), + must_neural_tone_words.end(), + word_list[i]) != must_neural_tone_words.end()) || + 
(temp_wstr.length() >= 2 && + find(must_neural_tone_words.begin(), + must_neural_tone_words.end(), + ppspeech::wstring2utf8string( + temp_wstr.substr(temp_wstr.length() - 2))) != + must_neural_tone_words.end())) { + finals_list[i].back() = finals_list[i].back().replace( + finals_list[i].back().length() - 1, 1, "5"); + } + (*finals).insert( + (*finals).end(), finals_list[i].begin(), finals_list[i].end()); + } + + return 0; +} + +int FrontEngineInterface::ThreeSandhi(const std::string &word, + std::vector *finals) { + std::wstring word_wstr = ppspeech::utf8string2wstring(word); + std::vector> finals_list; + std::vector finals_temp; + std::vector wordvec; + // 一个词转成向量形式 + if (0 != Word2WordVec(word, &wordvec)) { + LOG(ERROR) << "Failed to get word vector"; + return -1; + } + int word_num = wordvec.size(); + assert(word_num == word_wstr.length()); + + if (word_num == 2 && AllToneThree((*finals))) { + (*finals)[0] = (*finals)[0].replace((*finals)[0].length() - 1, 1, "2"); + } else if (word_num == 3) { + // 进行进一步分词,把长词切分更短些 + std::vector word_list; + if (0 != SplitWord(word, &word_list)) { + LOG(ERROR) << "Failed to split word."; + return -1; + } + if (AllToneThree((*finals))) { + std::wstring temp_wstr = ppspeech::utf8string2wstring(word_list[0]); + // disyllabic + monosyllabic, e.g. 蒙古/包 + if (temp_wstr.length() == 2) { + (*finals)[0] = + (*finals)[0].replace((*finals)[0].length() - 1, 1, "2"); + (*finals)[1] = + (*finals)[1].replace((*finals)[1].length() - 1, 1, "2"); + } else if (temp_wstr.length() == + 1) { // monosyllabic + disyllabic, e.g. 纸/老虎 + (*finals)[1] = + (*finals)[1].replace((*finals)[1].length() - 1, 1, "2"); + } + } else { + // 创建对应的 韵母列表 + finals_temp = {}; + finals_list = {}; + finals_temp.assign( + (*finals).begin(), + (*finals).begin() + + ppspeech::utf8string2wstring(word_list[0]).length()); + finals_list.push_back(finals_temp); + finals_temp.assign( + (*finals).begin() + + ppspeech::utf8string2wstring(word_list[0]).length(), + (*finals).end()); + finals_list.push_back(finals_temp); + + finals = new std::vector(); + for (int i = 0; i < finals_list.size(); i++) { + // e.g. 
所有/人 + if (AllToneThree(finals_list[i]) && + finals_list[i].size() == 2) { + finals_list[i][0] = finals_list[i][0].replace( + finals_list[i][0].length() - 1, 1, "2"); + } else if (i == 1 && !(AllToneThree(finals_list[i])) && + absl::EndsWith(finals_list[i][0], "3") == true && + absl::EndsWith(finals_list[0].back(), "3") == true) { + finals_list[0].back() = finals_list[0].back().replace( + finals_list[0].back().length() - 1, 1, "2"); + } + } + (*finals).insert( + (*finals).end(), finals_list[0].begin(), finals_list[0].end()); + (*finals).insert( + (*finals).end(), finals_list[1].begin(), finals_list[1].end()); + } + + } else if (word_num == 4) { //将成语拆分为两个长度为 2 的单词 + // 创建对应的 韵母列表 + finals_temp = {}; + finals_list = {}; + finals_temp.assign((*finals).begin(), (*finals).begin() + 2); + finals_list.push_back(finals_temp); + finals_temp.assign((*finals).begin() + 2, (*finals).end()); + finals_list.push_back(finals_temp); + + finals = new std::vector(); + for (int j = 0; j < finals_list.size(); j++) { + if (AllToneThree(finals_list[j])) { + finals_list[j][0] = finals_list[j][0].replace( + finals_list[j][0].length() - 1, 1, "2"); + } + (*finals).insert( + (*finals).end(), finals_list[j].begin(), finals_list[j].end()); + } + } + + return 0; +} + +int FrontEngineInterface::ModifyTone(const std::string &word, + const std::string &pos, + std::vector *finals) { + if ((0 != BuSandi(word, finals)) || (0 != YiSandhi(word, finals)) || + (0 != NeuralSandhi(word, pos, finals)) || + (0 != ThreeSandhi(word, finals))) { + LOG(ERROR) << "Failed to modify tone of the word: " << word; + return -1; + } + + return 0; +} + +std::vector> FrontEngineInterface::MergeErhua( + const std::vector &initials, + const std::vector &finals, + const std::string &word, + const std::string &pos) { + std::vector new_initials = {}; + std::vector new_finals = {}; + std::vector> new_initials_finals; + std::vector specified_pos = {"a", "j", "nr"}; + std::wstring word_wstr = ppspeech::utf8string2wstring(word); + std::vector wordvec; + // 一个词转成向量形式 + if (0 != Word2WordVec(word, &wordvec)) { + LOG(ERROR) << "Failed to get word vector"; + } + int word_num = wordvec.size(); + + if ((find(must_erhua.begin(), must_erhua.end(), word) == + must_erhua.end()) && + ((find(not_erhua.begin(), not_erhua.end(), word) != not_erhua.end()) || + (find(specified_pos.begin(), specified_pos.end(), pos) != + specified_pos.end()))) { + new_initials_finals.push_back(initials); + new_initials_finals.push_back(finals); + return new_initials_finals; + } + if (finals.size() != word_num) { + new_initials_finals.push_back(initials); + new_initials_finals.push_back(finals); + return new_initials_finals; + } + + assert(finals.size() == word_num); + for (int i = 0; i < finals.size(); i++) { + if (i == finals.size() - 1 && wordvec[i] == L"儿" && + (finals[i] == "er2" || finals[i] == "er5") && word_num >= 2 && + find(not_erhua.begin(), + not_erhua.end(), + ppspeech::wstring2utf8string(word_wstr.substr( + word_wstr.length() - 2))) == not_erhua.end() && + !new_finals.empty()) { + new_finals.back() = + new_finals.back().substr(0, new_finals.back().length() - 1) + + "r" + new_finals.back().substr(new_finals.back().length() - 1); + } else { + new_initials.push_back(initials[i]); + new_finals.push_back(finals[i]); + } + } + new_initials_finals.push_back(new_initials); + new_initials_finals.push_back(new_finals); + + return new_initials_finals; +} +} // namespace ppspeech diff --git a/demos/TTSCppFrontend/src/front/front_interface.h 
b/demos/TTSCppFrontend/src/front/front_interface.h new file mode 100644 index 00000000..8c16859c --- /dev/null +++ b/demos/TTSCppFrontend/src/front/front_interface.h @@ -0,0 +1,198 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef PADDLE_TTS_SERVING_FRONT_FRONT_INTERFACE_H +#define PADDLE_TTS_SERVING_FRONT_FRONT_INTERFACE_H + +#include +#include +#include +#include +#include +//#include "utils/dir_utils.h" +#include +#include "absl/strings/str_split.h" +#include "front/text_normalize.h" + + +namespace ppspeech { + +class FrontEngineInterface : public TextNormalizer { + public: + explicit FrontEngineInterface(std::string conf) : _conf_file(conf) { + TextNormalizer(); + _jieba = nullptr; + _initialed = false; + init(); + } + + int init(); + ~FrontEngineInterface() {} + + // 读取配置文件 + int ReadConfFile(); + + // 简体转繁体 + int Trand2Simp(const std::wstring &sentence, std::wstring *sentence_simp); + + // 生成字典 + int GenDict(const std::string &file, + std::map *map); + + // 由 词+词性的分词结果转为仅包含词的结果 + int GetSegResult(std::vector> *seg, + std::vector *seg_words); + + // 生成句子的音素,音调id。如果音素和音调未分开,则 toneids + // 为空(fastspeech2),反之则不为空(speedyspeech) + int GetSentenceIds(const std::string &sentence, + std::vector *phoneids, + std::vector *toneids); + + // 根据分词结果获取词的音素,音调id,并对读音进行适当修改 + // (ModifyTone)。如果音素和音调未分开,则 toneids + // 为空(fastspeech2),反之则不为空(speedyspeech) + int GetWordsIds( + const std::vector> &cut_result, + std::vector *phoneids, + std::vector *toneids); + + // 结巴分词生成包含词和词性的分词结果,再对分词结果进行适当修改 + // (MergeforModify) + int Cut(const std::string &sentence, + std::vector> *cut_result); + + // 字词到音素的映射,查找字典 + int GetPhone(const std::string &word, std::string *phone); + + // 音素到音素id + int Phone2Phoneid(const std::string &phone, + std::vector *phoneid, + std::vector *toneids); + + + // 根据韵母判断该词中每个字的读音都为第三声。true表示词中每个字都是第三声 + bool AllToneThree(const std::vector &finals); + + // 判断词是否是叠词 + bool IsReduplication(const std::string &word); + + // 获取每个字词的声母韵母列表 + int GetInitialsFinals(const std::string &word, + std::vector *word_initials, + std::vector *word_finals); + + // 获取每个字词的韵母列表 + int GetFinals(const std::string &word, + std::vector *word_finals); + + // 整个词转成向量形式,向量的每个元素对应词的一个字 + int Word2WordVec(const std::string &word, + std::vector *wordvec); + + // 将整个词重新进行 full cut,分词后,各个词会在词典中 + int SplitWord(const std::string &word, + std::vector *fullcut_word); + + // 对分词结果进行处理:对包含“不”字的分词结果进行整理 + std::vector> MergeBu( + std::vector> *seg_result); + + // 对分词结果进行处理:对包含“一”字的分词结果进行整理 + std::vector> Mergeyi( + std::vector> *seg_result); + + // 对分词结果进行处理:对前后相同的两个字进行合并 + std::vector> MergeReduplication( + std::vector> *seg_result); + + // 对一个词和后一个词他们的读音均为第三声的两个词进行合并 + std::vector> MergeThreeTones( + std::vector> *seg_result); + + // 对一个词的最后一个读音和后一个词的第一个读音为第三声的两个词进行合并 + std::vector> MergeThreeTones2( + std::vector> *seg_result); + + // 对分词结果进行处理:对包含“儿”字的分词结果进行整理 + std::vector> MergeEr( + std::vector> *seg_result); + + // 对分词结果进行处理、修改 + int 
MergeforModify( + std::vector> *seg_result, + std::vector> *merge_seg_result); + + + // 对包含“不”字的相关词音调进行修改 + int BuSandi(const std::string &word, std::vector *finals); + + // 对包含“一”字的相关词音调进行修改 + int YiSandhi(const std::string &word, std::vector *finals); + + // 对一些特殊词(包括量词,语助词等)的相关词音调进行修改 + int NeuralSandhi(const std::string &word, + const std::string &pos, + std::vector *finals); + + // 对包含第三声的相关词音调进行修改 + int ThreeSandhi(const std::string &word, std::vector *finals); + + // 对字词音调进行处理、修改 + int ModifyTone(const std::string &word, + const std::string &pos, + std::vector *finals); + + + // 对儿化音进行处理 + std::vector> MergeErhua( + const std::vector &initials, + const std::vector &finals, + const std::string &word, + const std::string &pos); + + + private: + bool _initialed; + cppjieba::Jieba *_jieba; + std::vector _punc; + std::vector _punc_omit; + + std::string _conf_file; + std::map conf_map; + std::map word_phone_map; + std::map phone_id_map; + std::map tone_id_map; + std::map trand_simp_map; + + + std::string _jieba_dict_path; + std::string _jieba_hmm_path; + std::string _jieba_user_dict_path; + std::string _jieba_idf_path; + std::string _jieba_stop_word_path; + + std::string _separate_tone; + std::string _word2phone_path; + std::string _phone2id_path; + std::string _tone2id_path; + std::string _trand2simp_path; + + std::vector must_erhua; + std::vector not_erhua; + + std::vector must_not_neural_tone_words; + std::vector must_neural_tone_words; +}; +} // namespace ppspeech +#endif \ No newline at end of file diff --git a/demos/TTSCppFrontend/src/front/text_normalize.cpp b/demos/TTSCppFrontend/src/front/text_normalize.cpp new file mode 100644 index 00000000..8420e840 --- /dev/null +++ b/demos/TTSCppFrontend/src/front/text_normalize.cpp @@ -0,0 +1,542 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
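+//
+// text_normalize.cpp: regex-driven normalization of written Chinese. Each
+// Re*() pass rewrites one written form (dates, times, temperatures and other
+// numeric patterns) into its spoken form before phoneme conversion.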
+#include "front/text_normalize.h" + +namespace ppspeech { + +// 初始化 digits_map and unit_map +int TextNormalizer::InitMap() { + digits_map["0"] = "零"; + digits_map["1"] = "一"; + digits_map["2"] = "二"; + digits_map["3"] = "三"; + digits_map["4"] = "四"; + digits_map["5"] = "五"; + digits_map["6"] = "六"; + digits_map["7"] = "七"; + digits_map["8"] = "八"; + digits_map["9"] = "九"; + + units_map[1] = "十"; + units_map[2] = "百"; + units_map[3] = "千"; + units_map[4] = "万"; + units_map[8] = "亿"; + + return 0; +} + +// 替换 +int TextNormalizer::Replace(std::wstring *sentence, + const int &pos, + const int &len, + const std::wstring &repstr) { + // 删除原来的 + sentence->erase(pos, len); + // 插入新的 + sentence->insert(pos, repstr); + return 0; +} + +// 根据标点符号切分句子 +int TextNormalizer::SplitByPunc(const std::wstring &sentence, + std::vector *sentence_part) { + std::wstring temp = sentence; + std::wregex reg(L"[:,;。?!,;?!]"); + std::wsmatch match; + + while (std::regex_search(temp, match, reg)) { + sentence_part->push_back( + temp.substr(0, match.position(0) + match.length(0))); + Replace(&temp, 0, match.position(0) + match.length(0), L""); + } + // 如果最后没有标点符号 + if (temp != L"") { + sentence_part->push_back(temp); + } + return 0; +} + +// 数字转文本,10200 - > 一万零二百 +std::string TextNormalizer::CreateTextValue(const std::string &num_str, + bool use_zero) { + std::string num_lstrip = + std::string(absl::StripPrefix(num_str, "0")).data(); + int len = num_lstrip.length(); + + if (len == 0) { + return ""; + } else if (len == 1) { + if (use_zero && (len < num_str.length())) { + return digits_map["0"] + digits_map[num_lstrip]; + } else { + return digits_map[num_lstrip]; + } + } else { + int largest_unit = 0; // 最大单位 + std::string first_part; + std::string second_part; + + if (len > 1 && len <= 2) { + largest_unit = 1; + } else if (len > 2 && len <= 3) { + largest_unit = 2; + } else if (len > 3 && len <= 4) { + largest_unit = 3; + } else if (len > 4 && len <= 8) { + largest_unit = 4; + } else if (len > 8) { + largest_unit = 8; + } + + first_part = num_str.substr(0, num_str.length() - largest_unit); + second_part = num_str.substr(num_str.length() - largest_unit); + + return CreateTextValue(first_part, use_zero) + units_map[largest_unit] + + CreateTextValue(second_part, use_zero); + } +} + +// 数字一个一个对应,可直接用于年份,电话,手机, +std::string TextNormalizer::SingleDigit2Text(const std::string &num_str, + bool alt_one) { + std::string text = ""; + if (alt_one) { + digits_map["1"] = "幺"; + } else { + digits_map["1"] = "一"; + } + + for (size_t i = 0; i < num_str.size(); i++) { + std::string num_int(1, num_str[i]); + if (digits_map.find(num_int) == digits_map.end()) { + LOG(ERROR) << "digits_map doesn't have key: " << num_int; + } + text += digits_map[num_int]; + } + + return text; +} + +std::string TextNormalizer::SingleDigit2Text(const std::wstring &num, + bool alt_one) { + std::string num_str = wstring2utf8string(num); + return SingleDigit2Text(num_str, alt_one); +} + +// 数字整体对应,可直接用于月份,日期,数值整数部分 +std::string TextNormalizer::MultiDigit2Text(const std::string &num_str, + bool alt_one, + bool use_zero) { + LOG(INFO) << "aaaaaaaaaaaaaaaa: " << alt_one << use_zero; + if (alt_one) { + digits_map["1"] = "幺"; + } else { + digits_map["1"] = "一"; + } + + std::wstring result = + utf8string2wstring(CreateTextValue(num_str, use_zero)); + std::wstring result_0(1, result[0]); + std::wstring result_1(1, result[1]); + // 一十八 --> 十八 + if ((result_0 == utf8string2wstring(digits_map["1"])) && + (result_1 == utf8string2wstring(units_map[1]))) { + return 
wstring2utf8string(result.substr(1, result.length())); + } else { + return wstring2utf8string(result); + } +} + +std::string TextNormalizer::MultiDigit2Text(const std::wstring &num, + bool alt_one, + bool use_zero) { + std::string num_str = wstring2utf8string(num); + return MultiDigit2Text(num_str, alt_one, use_zero); +} + +// 数字转文本,包括整数和小数 +std::string TextNormalizer::Digits2Text(const std::string &num_str) { + std::string text; + std::vector integer_decimal; + integer_decimal = absl::StrSplit(num_str, "."); + + if (integer_decimal.size() == 1) { // 整数 + text = MultiDigit2Text(integer_decimal[0]); + } else if (integer_decimal.size() == 2) { // 小数 + if (integer_decimal[0] == "") { // 无整数的小数类型,例如:.22 + text = "点" + + SingleDigit2Text( + std::string(absl::StripSuffix(integer_decimal[1], "0")) + .data()); + } else { // 常规小数类型,例如:12.34 + text = MultiDigit2Text(integer_decimal[0]) + "点" + + SingleDigit2Text( + std::string(absl::StripSuffix(integer_decimal[1], "0")) + .data()); + } + } else { + return "The value does not conform to the numeric format"; + } + + return text; +} + +std::string TextNormalizer::Digits2Text(const std::wstring &num) { + std::string num_str = wstring2utf8string(num); + return Digits2Text(num_str); +} + +// 日期,2021年8月18日 --> 二零二一年八月十八日 +int TextNormalizer::ReData(std::wstring *sentence) { + std::wregex reg( + L"(\\d{4}|\\d{2})年((0?[1-9]|1[0-2])月)?(((0?[1-9])|((1|2)[0-9])|30|31)" + L"([日号]))?"); + std::wsmatch match; + std::string rep; + + while (std::regex_search(*sentence, match, reg)) { + rep = ""; + rep += SingleDigit2Text(match[1]) + "年"; + if (match[3] != L"") { + rep += MultiDigit2Text(match[3], false, false) + "月"; + } + if (match[5] != L"") { + rep += MultiDigit2Text(match[5], false, false) + + wstring2utf8string(match[9]); + } + + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + + +// XX-XX-XX or XX/XX/XX 例如:2021/08/18 --> 二零二一年八月十八日 +int TextNormalizer::ReData2(std::wstring *sentence) { + std::wregex reg( + L"(\\d{4})([- /.])(0[1-9]|1[012])\\2(0[1-9]|[12][0-9]|3[01])"); + std::wsmatch match; + std::string rep; + + while (std::regex_search(*sentence, match, reg)) { + rep = ""; + rep += (SingleDigit2Text(match[1]) + "年"); + rep += (MultiDigit2Text(match[3], false, false) + "月"); + rep += (MultiDigit2Text(match[4], false, false) + "日"); + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// XX:XX:XX 09:09:02 --> 九点零九分零二秒 +int TextNormalizer::ReTime(std::wstring *sentence) { + std::wregex reg(L"([0-1]?[0-9]|2[0-3]):([0-5][0-9])(:([0-5][0-9]))?"); + std::wsmatch match; + std::string rep; + + while (std::regex_search(*sentence, match, reg)) { + rep = ""; + rep += (MultiDigit2Text(match[1], false, false) + "点"); + if (absl::StartsWith(wstring2utf8string(match[2]), "0")) { + rep += "零"; + } + rep += (MultiDigit2Text(match[2]) + "分"); + if (absl::StartsWith(wstring2utf8string(match[4]), "0")) { + rep += "零"; + } + rep += (MultiDigit2Text(match[4]) + "秒"); + + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 温度,例如:-24.3℃ --> 零下二十四点三度 +int TextNormalizer::ReTemperature(std::wstring *sentence) { + std::wregex reg(L"(-?)(\\d+(\\.\\d+)?)(°C|℃|度|摄氏度)"); + std::wsmatch match; + std::string rep; + std::string sign; + std::vector integer_decimal; + std::string unit; + + while (std::regex_search(*sentence, match, reg)) { + match[1] == L"-" ? sign = "负" : sign = ""; + match[4] == L"摄氏度" ? 
unit = "摄氏度" : unit = "度"; + rep = sign + Digits2Text(match[2]) + unit; + + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 分数,例如: 1/3 --> 三分之一 +int TextNormalizer::ReFrac(std::wstring *sentence) { + std::wregex reg(L"(-?)(\\d+)/(\\d+)"); + std::wsmatch match; + std::string sign; + std::string rep; + while (std::regex_search(*sentence, match, reg)) { + match[1] == L"-" ? sign = "负" : sign = ""; + rep = sign + MultiDigit2Text(match[3]) + "分之" + + MultiDigit2Text(match[2]); + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 百分数,例如:45.5% --> 百分之四十五点五 +int TextNormalizer::RePercentage(std::wstring *sentence) { + std::wregex reg(L"(-?)(\\d+(\\.\\d+)?)%"); + std::wsmatch match; + std::string sign; + std::string rep; + std::vector integer_decimal; + + while (std::regex_search(*sentence, match, reg)) { + match[1] == L"-" ? sign = "负" : sign = ""; + rep = sign + "百分之" + Digits2Text(match[2]); + + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 手机号码,例如:+86 18883862235 --> 八六幺八八八三八六二二三五 +int TextNormalizer::ReMobilePhone(std::wstring *sentence) { + std::wregex reg( + L"(\\d)?((\\+?86 ?)?1([38]\\d|5[0-35-9]|7[678]|9[89])\\d{8})(\\d)?"); + std::wsmatch match; + std::string rep; + std::vector country_phonenum; + + while (std::regex_search(*sentence, match, reg)) { + country_phonenum = absl::StrSplit(wstring2utf8string(match[0]), "+"); + rep = ""; + for (int i = 0; i < country_phonenum.size(); i++) { + LOG(INFO) << country_phonenum[i]; + rep += SingleDigit2Text(country_phonenum[i], true); + } + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 座机号码,例如:010-51093154 --> 零幺零五幺零九三幺五四 +int TextNormalizer::RePhone(std::wstring *sentence) { + std::wregex reg( + L"(\\d)?((0(10|2[1-3]|[3-9]\\d{2})-?)?[1-9]\\d{6,7})(\\d)?"); + std::wsmatch match; + std::vector zone_phonenum; + std::string rep; + + while (std::regex_search(*sentence, match, reg)) { + rep = ""; + zone_phonenum = absl::StrSplit(wstring2utf8string(match[0]), "-"); + for (int i = 0; i < zone_phonenum.size(); i++) { + rep += SingleDigit2Text(zone_phonenum[i], true); + } + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 范围,例如:60~90 --> 六十到九十 +int TextNormalizer::ReRange(std::wstring *sentence) { + std::wregex reg( + L"((-?)((\\d+)(\\.\\d+)?)|(\\.(\\d+)))[-~]((-?)((\\d+)(\\.\\d+)?)|(\\.(" + L"\\d+)))"); + std::wsmatch match; + std::string rep; + std::string sign1; + std::string sign2; + + while (std::regex_search(*sentence, match, reg)) { + rep = ""; + match[2] == L"-" ? sign1 = "负" : sign1 = ""; + if (match[6] != L"") { + rep += sign1 + Digits2Text(match[6]) + "到"; + } else { + rep += sign1 + Digits2Text(match[3]) + "到"; + } + match[9] == L"-" ? 
sign2 = "负" : sign2 = ""; + if (match[13] != L"") { + rep += sign2 + Digits2Text(match[13]); + } else { + rep += sign2 + Digits2Text(match[10]); + } + + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 带负号的整数,例如:-10 --> 负十 +int TextNormalizer::ReInterger(std::wstring *sentence) { + std::wregex reg(L"(-)(\\d+)"); + std::wsmatch match; + std::string rep; + while (std::regex_search(*sentence, match, reg)) { + rep = "负" + MultiDigit2Text(match[2]); + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 纯小数 +int TextNormalizer::ReDecimalNum(std::wstring *sentence) { + std::wregex reg(L"(-?)((\\d+)(\\.\\d+))|(\\.(\\d+))"); + std::wsmatch match; + std::string sign; + std::string rep; + // std::vector integer_decimal; + while (std::regex_search(*sentence, match, reg)) { + match[1] == L"-" ? sign = "负" : sign = ""; + if (match[5] != L"") { + rep = sign + Digits2Text(match[5]); + } else { + rep = sign + Digits2Text(match[2]); + } + + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + + return 0; +} + +// 正整数 + 量词 +int TextNormalizer::RePositiveQuantifiers(std::wstring *sentence) { + std::wstring common_quantifiers = + L"(朵|匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|" + L"担|颗|壳|窠|曲|墙|群|腔|砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|" + L"溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|针|线|管|名|位|身|堂|课|" + L"本|页|家|户|层|丝|毫|厘|分|钱|两|斤|担|铢|石|钧|锱|忽|(千|毫|微)克|" + L"毫|厘|(公)分|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|米|撮|勺|" + L"合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|" + L"卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|纪|岁|世|更|" + L"夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块|" + L"元|(亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|美|)元|(亿|千万|" + L"百万|万|千|百|)块|角|毛|分)"; + std::wregex reg(L"(\\d+)([多余几])?" + common_quantifiers); + std::wsmatch match; + std::string rep; + while (std::regex_search(*sentence, match, reg)) { + rep = MultiDigit2Text(match[1]); + Replace(sentence, + match.position(1), + match.length(1), + utf8string2wstring(rep)); + } + + return 0; +} + +// 编号类数字,例如: 89757 --> 八九七五七 +int TextNormalizer::ReDefalutNum(std::wstring *sentence) { + std::wregex reg(L"\\d{3}\\d*"); + std::wsmatch match; + while (std::regex_search(*sentence, match, reg)) { + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(SingleDigit2Text(match[0]))); + } + + return 0; +} + +int TextNormalizer::ReNumber(std::wstring *sentence) { + std::wregex reg(L"(-?)((\\d+)(\\.\\d+)?)|(\\.(\\d+))"); + std::wsmatch match; + std::string sign; + std::string rep; + while (std::regex_search(*sentence, match, reg)) { + match[1] == L"-" ? 
sign = "负" : sign = ""; + if (match[5] != L"") { + rep = sign + Digits2Text(match[5]); + } else { + rep = sign + Digits2Text(match[2]); + } + + Replace(sentence, + match.position(0), + match.length(0), + utf8string2wstring(rep)); + } + return 0; +} + +// 整体正则,按顺序 +int TextNormalizer::SentenceNormalize(std::wstring *sentence) { + ReData(sentence); + ReData2(sentence); + ReTime(sentence); + ReTemperature(sentence); + ReFrac(sentence); + RePercentage(sentence); + ReMobilePhone(sentence); + RePhone(sentence); + ReRange(sentence); + ReInterger(sentence); + ReDecimalNum(sentence); + RePositiveQuantifiers(sentence); + ReDefalutNum(sentence); + ReNumber(sentence); + return 0; +} +} // namespace ppspeech \ No newline at end of file diff --git a/demos/TTSCppFrontend/src/front/text_normalize.h b/demos/TTSCppFrontend/src/front/text_normalize.h new file mode 100644 index 00000000..4383fa1b --- /dev/null +++ b/demos/TTSCppFrontend/src/front/text_normalize.h @@ -0,0 +1,77 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef PADDLE_TTS_SERVING_FRONT_TEXT_NORMALIZE_H +#define PADDLE_TTS_SERVING_FRONT_TEXT_NORMALIZE_H + +#include +#include +#include +#include +#include +#include "absl/strings/str_split.h" +#include "absl/strings/strip.h" +#include "base/type_conv.h" + +namespace ppspeech { + +class TextNormalizer { + public: + TextNormalizer() { InitMap(); } + ~TextNormalizer() {} + + int InitMap(); + int Replace(std::wstring *sentence, + const int &pos, + const int &len, + const std::wstring &repstr); + int SplitByPunc(const std::wstring &sentence, + std::vector *sentence_part); + + std::string CreateTextValue(const std::string &num, bool use_zero = true); + std::string SingleDigit2Text(const std::string &num_str, + bool alt_one = false); + std::string SingleDigit2Text(const std::wstring &num, bool alt_one = false); + std::string MultiDigit2Text(const std::string &num_str, + bool alt_one = false, + bool use_zero = true); + std::string MultiDigit2Text(const std::wstring &num, + bool alt_one = false, + bool use_zero = true); + std::string Digits2Text(const std::string &num_str); + std::string Digits2Text(const std::wstring &num); + + int ReData(std::wstring *sentence); + int ReData2(std::wstring *sentence); + int ReTime(std::wstring *sentence); + int ReTemperature(std::wstring *sentence); + int ReFrac(std::wstring *sentence); + int RePercentage(std::wstring *sentence); + int ReMobilePhone(std::wstring *sentence); + int RePhone(std::wstring *sentence); + int ReRange(std::wstring *sentence); + int ReInterger(std::wstring *sentence); + int ReDecimalNum(std::wstring *sentence); + int RePositiveQuantifiers(std::wstring *sentence); + int ReDefalutNum(std::wstring *sentence); + int ReNumber(std::wstring *sentence); + int SentenceNormalize(std::wstring *sentence); + + + private: + std::map digits_map; + std::map units_map; +}; +} // namespace ppspeech + +#endif \ No newline at end of file diff --git 
a/demos/TTSCppFrontend/third-party/CMakeLists.txt b/demos/TTSCppFrontend/third-party/CMakeLists.txt new file mode 100644 index 00000000..0579b8f2 --- /dev/null +++ b/demos/TTSCppFrontend/third-party/CMakeLists.txt @@ -0,0 +1,64 @@ +cmake_minimum_required(VERSION 3.10) +project(tts_third_party_libs) + +include(ExternalProject) + +# gflags +ExternalProject_Add(gflags + GIT_REPOSITORY https://github.com/gflags/gflags.git + GIT_TAG v2.2.2 + PREFIX ${CMAKE_CURRENT_BINARY_DIR} + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_STATIC_LIBS=OFF + -DBUILD_SHARED_LIBS=ON +) + +# glog +ExternalProject_Add( + glog + GIT_REPOSITORY https://github.com/google/glog.git + GIT_TAG v0.6.0 + PREFIX ${CMAKE_CURRENT_BINARY_DIR} + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + DEPENDS gflags +) + +# abseil +ExternalProject_Add( + abseil + GIT_REPOSITORY https://github.com/abseil/abseil-cpp.git + GIT_TAG 20230125.1 + PREFIX ${CMAKE_CURRENT_BINARY_DIR} + INSTALL_DIR ${CMAKE_CURRENT_BINARY_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DABSL_PROPAGATE_CXX_STD=ON +) + +# cppjieba (header-only) +ExternalProject_Add( + cppjieba + GIT_REPOSITORY https://github.com/yanyiwu/cppjieba.git + GIT_TAG v5.0.3 + PREFIX ${CMAKE_CURRENT_BINARY_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +# limonp (header-only) +ExternalProject_Add( + limonp + GIT_REPOSITORY https://github.com/yanyiwu/limonp.git + GIT_TAG v0.6.6 + PREFIX ${CMAKE_CURRENT_BINARY_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) diff --git a/demos/audio_searching/src/test_audio_search.py b/demos/audio_searching/src/test_audio_search.py index cb91e156..f9ea2929 100644 --- a/demos/audio_searching/src/test_audio_search.py +++ b/demos/audio_searching/src/test_audio_search.py @@ -14,8 +14,8 @@ from audio_search import app from fastapi.testclient import TestClient -from utils.utility import download -from utils.utility import unpack +from paddlespeech.dataset.download import download +from paddlespeech.dataset.download import unpack client = TestClient(app) diff --git a/demos/audio_searching/src/test_vpr_search.py b/demos/audio_searching/src/test_vpr_search.py index 298e12eb..cc795564 100644 --- a/demos/audio_searching/src/test_vpr_search.py +++ b/demos/audio_searching/src/test_vpr_search.py @@ -14,8 +14,8 @@ from fastapi.testclient import TestClient from vpr_search import app -from utils.utility import download -from utils.utility import unpack +from paddlespeech.dataset.download import download +from paddlespeech.dataset.download import unpack client = TestClient(app) diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index c815a88a..ee2acd6f 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -17,7 +17,7 @@ The input of this demo should be a WAV file(`.wav`), and the sample rate must be Here are sample files for this demo that can be downloaded: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` ### 3. 
Usage

@@ -27,6 +27,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
   paddlespeech asr --input ./zh.wav -v
   # English
   paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v
+  # Code-Switch
+  paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav -v
   # Chinese ASR + Punctuation Restoration
   paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v
   ```
@@ -40,6 +42,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
   - `input`(required): Audio file to recognize.
   - `model`: Model type of asr task. Default: `conformer_wenetspeech`.
   - `lang`: Model language. Default: `zh`.
+  - `codeswitch`: Whether to use a code-switch model. Default: `False`.
   - `sample_rate`: Sample rate of the model. Default: `16000`.
   - `config`: Config of asr task. Use pretrained model when it is None. Default: `None`.
   - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`.
@@ -83,14 +86,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
   Here is a list of pretrained models released by PaddleSpeech that can be used by command and python API:
 
-| Model | Language | Sample Rate
-| :--- | :---: | :---: |
-| conformer_wenetspeech | zh | 16k
-| conformer_online_multicn | zh | 16k
-| conformer_aishell | zh | 16k
-| conformer_online_aishell | zh | 16k
-| transformer_librispeech | en | 16k
-| deepspeech2online_wenetspeech | zh | 16k
-| deepspeech2offline_aishell| zh| 16k
-| deepspeech2online_aishell | zh | 16k
-| deepspeech2offline_librispeech | en | 16k
+| Model | Code Switch | Language | Sample Rate
+| :--- | :---: | :---: | :---: |
+| conformer_wenetspeech | False | zh | 16k
+| conformer_online_multicn | False | zh | 16k
+| conformer_aishell | False | zh | 16k
+| conformer_online_aishell | False | zh | 16k
+| transformer_librispeech | False | en | 16k
+| deepspeech2online_wenetspeech | False | zh | 16k
+| deepspeech2offline_aishell | False | zh| 16k
+| deepspeech2online_aishell | False | zh | 16k
+| deepspeech2offline_librispeech | False | en | 16k
+| conformer_talcs | True | zh_en | 16k
diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md
index 13aa9f27..62dce3bc 100644
--- a/demos/speech_recognition/README_cn.md
+++ b/demos/speech_recognition/README_cn.md
@@ -1,4 +1,5 @@
 (简体中文|[English](./README.md))
+ (简体中文|[English](./README.md))
 
 # 语音识别
 ## 介绍
@@ -16,7 +17,7 @@
 可以下载此 demo 的示例音频:
 ```bash
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
 ```
 ### 3.
使用方法 - 命令行 (推荐使用) @@ -25,6 +26,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee paddlespeech asr --input ./zh.wav -v # 英文 paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v + #中英混合 + paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav -v # 中文 + 标点恢复 paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v ``` @@ -38,6 +41,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `input`(必须输入):用于识别的音频文件。 - `model`:ASR 任务的模型,默认值:`conformer_wenetspeech`。 - `lang`:模型语言,默认值:`zh`。 + - `codeswitch`: 是否使用语言转换,默认值:`False`。 - `sample_rate`:音频采样率,默认值:`16000`。 - `config`:ASR 任务的参数文件,若不设置则使用预训练模型中的默认配置,默认值:`None`。 - `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`。 @@ -80,14 +84,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### 4.预训练模型 以下是 PaddleSpeech 提供的可以被命令行和 python API 使用的预训练模型列表: -| 模型 | 语言 | 采样率 -| :--- | :---: | :---: | -| conformer_wenetspeech | zh | 16k -| conformer_online_multicn | zh | 16k -| conformer_aishell | zh | 16k -| conformer_online_aishell | zh | 16k -| transformer_librispeech | en | 16k -| deepspeech2online_wenetspeech | zh | 16k -| deepspeech2offline_aishell| zh| 16k -| deepspeech2online_aishell | zh | 16k -| deepspeech2offline_librispeech | en | 16k +| 模型 | 语言转换 | 语言 | 采样率 +| :--- | :---: | :---: | :---: | +| conformer_wenetspeech | False | zh | 16k +| conformer_online_multicn | False | zh | 16k +| conformer_aishell | False | zh | 16k +| conformer_online_aishell | False | zh | 16k +| transformer_librispeech | False | en | 16k +| deepspeech2online_wenetspeech | False | zh | 16k +| deepspeech2offline_aishell | False | zh| 16k +| deepspeech2online_aishell | False | zh | 16k +| deepspeech2offline_librispeech | False | en | 16k +| conformer_talcs | True | zh_en | 16k diff --git a/demos/speech_recognition/run.sh b/demos/speech_recognition/run.sh index e48ff3e9..8ba6e4c3 100755 --- a/demos/speech_recognition/run.sh +++ b/demos/speech_recognition/run.sh @@ -2,6 +2,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav # asr paddlespeech asr --input ./zh.wav @@ -18,6 +19,11 @@ paddlespeech asr --help # english asr paddlespeech asr --lang en --model transformer_librispeech --input ./en.wav + +# code-switch asr +paddlespeech asr --lang zh_en --codeswitch True --model conformer_talcs --input ./ch_zh_mix.wav + + # model stats paddlespeech stats --task asr diff --git a/demos/speech_web/README.md b/demos/speech_web/README.md index 572781ab..fc1fe710 100644 --- a/demos/speech_web/README.md +++ b/demos/speech_web/README.md @@ -23,7 +23,7 @@ Paddle Speech Demo 是一个以 PaddleSpeech 的语音交互功能为主体开 + ERNIE-SAT:语言-语音跨模态大模型 ERNIE-SAT 可视化展示示例,支持个性化合成,跨语言语音合成(音频为中文则输入英文文本进行合成),语音编辑(修改音频文字中间的结果)功能。 ERNIE-SAT 更多实现细节,可以参考: + [【ERNIE-SAT with AISHELL-3 dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/ernie_sat) - + [【ERNIE-SAT with with AISHELL3 and VCTK datasets】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat) + + [【ERNIE-SAT with AISHELL3 and VCTK datasets】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat) + [【ERNIE-SAT with VCTK dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/ernie_sat) 运行效果: diff --git 
a/demos/speech_web/speech_server/main.py b/demos/speech_web/speech_server/main.py
index 03e7e599..f4678628 100644
--- a/demos/speech_web/speech_server/main.py
+++ b/demos/speech_web/speech_server/main.py
@@ -260,7 +260,7 @@ async def websocket_endpoint_online(websocket: WebSocket):
             # and we break the loop
             if message['signal'] == 'start':
                 resp = {"status": "ok", "signal": "server_ready"}
-                # do something at begining here
+                # do something at beginning here
                 # create the instance to process the audio
                 # connection_handler = chatbot.asr.connection_handler
                 connection_handler = PaddleASRConnectionHanddler(engine)
diff --git a/demos/speech_web/speech_server/requirements.txt b/demos/speech_web/speech_server/requirements.txt
index cdc65465..8425a1fe 100644
--- a/demos/speech_web/speech_server/requirements.txt
+++ b/demos/speech_web/speech_server/requirements.txt
@@ -1,8 +1,6 @@
 aiofiles
 faiss-cpu
-praatio==5.0.0
+praatio>=5.0.0
 pydantic
 python-multipart
-scikit_learn
 starlette
-uvicorn
diff --git a/demos/streaming_asr_server/README.md b/demos/streaming_asr_server/README.md
index 1d33b694..31256d15 100644
--- a/demos/streaming_asr_server/README.md
+++ b/demos/streaming_asr_server/README.md
@@ -9,7 +9,7 @@ This demo is an implementation of starting the streaming speech service and acce
 
 Streaming ASR server only support `websocket` protocol, and doesn't support `http` protocol.
 
-服务接口定义请参考:
+For service interface definitions, please refer to:
 - [PaddleSpeech Streaming Server WebSocket API](https://github.com/PaddlePaddle/PaddleSpeech/wiki/PaddleSpeech-Server-WebSocket-API)
 
 ## Usage
@@ -23,7 +23,7 @@ You can choose one way from easy, meduim and hard to install paddlespeech.
 **If you install in easy mode, you need to prepare the yaml file by yourself, you can refer to
 
 ### 2. Prepare config File
-The configuration file can be found in `conf/ws_application.yaml` 和 `conf/ws_conformer_wenetspeech_application.yaml`.
+The configuration file can be found in `conf/ws_application.yaml` or `conf/ws_conformer_wenetspeech_application.yaml`.
 
 At present, the speech tasks integrated by the model include: DeepSpeech2 and conformer.
 
@@ -87,7 +87,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 
     server_executor = ServerExecutor()
     server_executor(
-        config_file="./conf/ws_conformer_wenetspeech_application.yaml",
+        config_file="./conf/ws_conformer_wenetspeech_application_faster.yaml",
         log_file="./log/paddlespeech.log")
 ```
 
@@ -579,3 +579,354 @@ bash server.sh
 [2022-05-07 11:11:18,915] [ INFO] - audio duration: 4.9968125, elapsed time: 15.928460597991943, RTF=3.187724293835709
 [2022-05-07 11:11:18,916] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康
 ```
+
+## Generate the corresponding subtitle (.srt format) from an audio file (.wav or .mp3 format)
+
+By default, each server is deployed on the CPU; speech recognition and punctuation prediction can be deployed on different GPUs by modifying the `device` parameter in the respective service configuration file.
+
+We use the `streaming_asr_server.py` and `punc_server.py` services to launch streaming speech recognition and punctuation prediction respectively, and the `websocket_client_srt.py` script to call both services at the same time and generate the corresponding subtitle file (.srt format).
+
+**You need to install ffmpeg before running this script.**
+
+**You should be in the `.../demos/streaming_asr_server/` directory.**
+
+### 1. Start two servers
+
+```bash
+# Note: streaming speech recognition and punctuation prediction are configured on different graphics cards through their configuration files
+paddlespeech_server start --config_file ./conf/ws_conformer_wenetspeech_application.yaml
+```
+
+Open another terminal and run the following command:
+```bash
+paddlespeech_server start --config_file conf/punc_application.yaml
+```
+
+### 2. Call the client
+
+ ```bash
+ python3 local/websocket_client_srt.py --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ../../data/认知.mp3
+ ```
+ Output:
+ ```text
+ [2023-03-30 23:26:13,991] [ INFO] - Start to do streaming asr client
+[2023-03-30 23:26:13,994] [ INFO] - asr websocket client start
+[2023-03-30 23:26:13,994] [ INFO] - endpoint: http://127.0.0.1:8190/paddlespeech/text
+[2023-03-30 23:26:13,994] [ INFO] - endpoint: ws://127.0.0.1:8090/paddlespeech/asr/streaming
+[2023-03-30 23:26:14,475] [ INFO] - /home/fxb/PaddleSpeech-develop/data/认知.mp3 converted to /home/fxb/PaddleSpeech-develop/data/认知.wav
+[2023-03-30 23:26:14,476] [ INFO] - start to process the wavscp: /home/fxb/PaddleSpeech-develop/data/认知.wav
+[2023-03-30 23:26:14,515] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"}
+[2023-03-30 23:26:14,533] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,545] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,556] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,572] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,588] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,600] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,613] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:14,626] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:15,122] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,135] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,154] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,163] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,175] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,185] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,196] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,637] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,648] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,657] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,666] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,676] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,683] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,691] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:15,703] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:16,146] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,159] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,167] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,177] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,187] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,197] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,210] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:16,694] [ INFO]
- client receive msg={'result': '第一部分是认知部分'} +[2023-03-30 23:26:16,704] [ INFO] - client receive msg={'result': '第一部分是认知部分'} +[2023-03-30 23:26:16,713] [ INFO] - client receive msg={'result': '第一部分是认知部分'} +[2023-03-30 23:26:16,725] [ INFO] - client receive msg={'result': '第一部分是认知部分'} +[2023-03-30 23:26:16,737] [ INFO] - client receive msg={'result': '第一部分是认知部分'} +[2023-03-30 23:26:16,749] [ INFO] - client receive msg={'result': '第一部分是认知部分'} +[2023-03-30 23:26:16,759] [ INFO] - client receive msg={'result': '第一部分是认知部分'} +[2023-03-30 23:26:16,770] [ INFO] - client receive msg={'result': '第一部分是认知部分'} +[2023-03-30 23:26:17,279] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'} +[2023-03-30 23:26:17,302] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'} +[2023-03-30 23:26:17,316] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'} +[2023-03-30 23:26:17,332] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'} +[2023-03-30 23:26:17,343] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'} +[2023-03-30 23:26:17,358] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'} +[2023-03-30 23:26:17,373] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'} +[2023-03-30 23:26:17,958] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'} +[2023-03-30 23:26:17,971] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'} +[2023-03-30 23:26:17,987] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'} +[2023-03-30 23:26:18,000] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'} +[2023-03-30 23:26:18,017] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'} +[2023-03-30 23:26:18,028] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'} +[2023-03-30 23:26:18,038] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'} +[2023-03-30 23:26:18,049] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'} +[2023-03-30 23:26:18,653] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'} +[2023-03-30 23:26:18,689] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'} +[2023-03-30 23:26:18,701] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'} +[2023-03-30 23:26:18,712] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'} +[2023-03-30 23:26:18,723] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'} +[2023-03-30 23:26:18,750] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'} +[2023-03-30 23:26:18,767] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'} +[2023-03-30 23:26:19,295] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'} +[2023-03-30 23:26:19,307] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'} +[2023-03-30 23:26:19,323] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'} +[2023-03-30 23:26:19,332] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'} +[2023-03-30 23:26:19,342] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'} +[2023-03-30 23:26:19,349] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'} +[2023-03-30 23:26:19,373] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'} +[2023-03-30 23:26:19,389] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式'} +[2023-03-30 23:26:20,046] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'} +[2023-03-30 23:26:20,055] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'} +[2023-03-30 23:26:20,067] [ INFO] - client receive msg={'result': 
'第一部分是认知部分该部分通过示意图和文本的形式向学生'} +[2023-03-30 23:26:20,076] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'} +[2023-03-30 23:26:20,094] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'} +[2023-03-30 23:26:20,124] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'} +[2023-03-30 23:26:20,135] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生'} +[2023-03-30 23:26:20,732] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'} +[2023-03-30 23:26:20,742] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'} +[2023-03-30 23:26:20,757] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'} +[2023-03-30 23:26:20,770] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'} +[2023-03-30 23:26:20,782] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'} +[2023-03-30 23:26:20,798] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'} +[2023-03-30 23:26:20,815] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'} +[2023-03-30 23:26:20,834] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解'} +[2023-03-30 23:26:21,390] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'} +[2023-03-30 23:26:21,405] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'} +[2023-03-30 23:26:21,416] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'} +[2023-03-30 23:26:21,428] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'} +[2023-03-30 23:26:21,448] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'} +[2023-03-30 23:26:21,459] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'} +[2023-03-30 23:26:21,473] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感'} +[2023-03-30 23:26:22,065] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'} +[2023-03-30 23:26:22,085] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'} +[2023-03-30 23:26:22,110] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'} +[2023-03-30 23:26:22,118] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'} +[2023-03-30 23:26:22,137] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'} +[2023-03-30 23:26:22,144] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'} +[2023-03-30 23:26:22,154] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'} +[2023-03-30 23:26:22,169] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作'} +[2023-03-30 23:26:22,698] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'} +[2023-03-30 23:26:22,709] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'} +[2023-03-30 23:26:22,731] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'} +[2023-03-30 23:26:22,743] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'} +[2023-03-30 23:26:22,755] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'} +[2023-03-30 23:26:22,771] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'} +[2023-03-30 23:26:22,782] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理'} 
+[2023-03-30 23:26:23,415] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'} +[2023-03-30 23:26:23,430] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'} +[2023-03-30 23:26:23,442] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'} +[2023-03-30 23:26:23,456] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'} +[2023-03-30 23:26:23,470] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'} +[2023-03-30 23:26:23,487] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'} +[2023-03-30 23:26:23,498] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'} +[2023-03-30 23:26:23,524] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生'} +[2023-03-30 23:26:24,200] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'} +[2023-03-30 23:26:24,210] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'} +[2023-03-30 23:26:24,219] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'} +[2023-03-30 23:26:24,231] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'} +[2023-03-30 23:26:24,250] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'} +[2023-03-30 23:26:24,262] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'} +[2023-03-30 23:26:24,272] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备'} +[2023-03-30 23:26:24,898] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'} +[2023-03-30 23:26:24,903] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'} +[2023-03-30 23:26:24,907] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'} +[2023-03-30 23:26:24,932] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'} +[2023-03-30 23:26:24,957] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'} +[2023-03-30 23:26:24,979] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'} +[2023-03-30 23:26:24,991] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'} +[2023-03-30 23:26:25,011] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致'} +[2023-03-30 23:26:25,616] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'} +[2023-03-30 23:26:25,625] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'} +[2023-03-30 23:26:25,648] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'} +[2023-03-30 23:26:25,658] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'} +[2023-03-30 23:26:25,669] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'} +[2023-03-30 23:26:25,681] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'} +[2023-03-30 23:26:25,690] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'} +[2023-03-30 23:26:25,707] [ INFO] - client receive 
msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'} +[2023-03-30 23:26:26,378] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'} +[2023-03-30 23:26:26,384] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'} +[2023-03-30 23:26:26,389] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'} +[2023-03-30 23:26:26,397] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'} +[2023-03-30 23:26:26,402] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'} +[2023-03-30 23:26:26,415] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'} +[2023-03-30 23:26:26,428] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知'} +[2023-03-30 23:26:27,008] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'} +[2023-03-30 23:26:27,018] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'} +[2023-03-30 23:26:27,026] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'} +[2023-03-30 23:26:27,037] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'} +[2023-03-30 23:26:27,046] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'} +[2023-03-30 23:26:27,054] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'} +[2023-03-30 23:26:27,062] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'} +[2023-03-30 23:26:27,070] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使'} +[2023-03-30 23:26:27,735] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'} +[2023-03-30 23:26:27,745] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'} +[2023-03-30 23:26:27,755] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'} +[2023-03-30 23:26:27,769] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'} +[2023-03-30 23:26:27,783] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'} +[2023-03-30 23:26:27,794] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'} +[2023-03-30 23:26:27,804] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传'} +[2023-03-30 23:26:28,454] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'} +[2023-03-30 23:26:28,472] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'} +[2023-03-30 23:26:28,481] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'} +[2023-03-30 23:26:28,489] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'} +[2023-03-30 23:26:28,499] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'} +[2023-03-30 23:26:28,533] [ INFO] - client receive msg={'result': 
'第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'} +[2023-03-30 23:26:28,543] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'} +[2023-03-30 23:26:28,556] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内'} +[2023-03-30 23:26:29,212] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'} +[2023-03-30 23:26:29,222] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'} +[2023-03-30 23:26:29,233] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'} +[2023-03-30 23:26:29,246] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'} +[2023-03-30 23:26:29,258] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'} +[2023-03-30 23:26:29,270] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'} +[2023-03-30 23:26:29,286] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图'} +[2023-03-30 23:26:30,003] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'} +[2023-03-30 23:26:30,013] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'} +[2023-03-30 23:26:30,038] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'} +[2023-03-30 23:26:30,048] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'} +[2023-03-30 23:26:30,062] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'} +[2023-03-30 23:26:30,074] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'} +[2023-03-30 23:26:30,114] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'} +[2023-03-30 23:26:30,125] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅'} +[2023-03-30 23:26:30,856] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'} +[2023-03-30 23:26:30,876] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'} +[2023-03-30 23:26:30,885] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'} +[2023-03-30 23:26:30,897] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'} +[2023-03-30 23:26:30,914] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'} +[2023-03-30 23:26:30,940] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'} +[2023-03-30 23:26:30,952] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说'} +[2023-03-30 23:26:31,655] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'} +[2023-03-30 23:26:31,696] [ INFO] - client 
receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'} +[2023-03-30 23:26:31,709] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'} +[2023-03-30 23:26:31,718] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'} +[2023-03-30 23:26:31,727] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'} +[2023-03-30 23:26:31,740] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'} +[2023-03-30 23:26:31,757] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'} +[2023-03-30 23:26:31,768] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明'} +[2023-03-30 23:26:32,476] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'} +[2023-03-30 23:26:32,486] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'} +[2023-03-30 23:26:32,495] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'} +[2023-03-30 23:26:32,549] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'} +[2023-03-30 23:26:32,560] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'} +[2023-03-30 23:26:32,574] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'} +[2023-03-30 23:26:32,590] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助'} +[2023-03-30 23:26:33,338] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'} +[2023-03-30 23:26:33,356] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'} +[2023-03-30 23:26:33,368] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'} +[2023-03-30 23:26:33,386] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'} +[2023-03-30 23:26:33,397] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'} +[2023-03-30 23:26:33,409] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'} +[2023-03-30 23:26:33,424] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'} +[2023-03-30 23:26:33,434] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生'} +[2023-03-30 23:26:34,352] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'} +[2023-03-30 23:26:34,364] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'} +[2023-03-30 23:26:34,377] [ INFO] - client receive msg={'result': 
'第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感'}
+...
+[2023-03-30 23:26:45,226] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以及意义感'}
+[2023-03-30 23:26:46,380] [ INFO] - client punctuation restored msg={'result': '第一部分是认知部分,该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理,让学生对设备有大致的认知。随后使用真实传感器的内部构造图,辅以文字说明,进一步帮助学生对传感器有更深刻的印象,最后结合具体的实践应用,提升学生对实训的兴趣以及意义感。'}
+[2023-03-30 23:27:01,059] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '第一部分是认知部分,该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理,让学生对设备有大致的认知。随后使用真实传感器的内部构造图,辅以文字说明,进一步帮助学生对传感器有更深刻的印象,最后结合具体的实践应用,提升学生对实训的兴趣以及意义感。', 'times': [{'w': '第', 'bg': 0.0, 'ed': 0.36}, {'w': '一', 'bg': 0.36, 'ed': 0.48}, {'w': '部', 'bg': 0.48, 'ed': 0.62}, ..., {'w': '义', 'bg': 25.34, 'ed': 25.46}, {'w': '感', 'bg': 25.46, 'ed': 26.04}]}
+[2023-03-30 23:27:01,060] [ INFO] - audio duration: 26.04, elapsed time: 46.581613540649414, RTF=1.7888484462614982
+sentences: ['第一部分是认知部分', '该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理', '让学生对设备有大致的认知', '随后使用真实传感器的内部构造图', '辅以文字说明', '进一步帮助学生对传感器有更深刻的印象', '最后结合具体的实践应用', '提升学生对实训的兴趣以及意义感']
+relative_times: [[0.0, 2.1], [2.1, 8.06], [8.06, 11.040000000000001], [11.040000000000001, 14.280000000000001], [14.280000000000001, 15.72], [15.72, 19.8], [19.8, 22.44], [22.44, 26.04]]
+[2023-03-30 23:27:01,076] [ INFO] - results saved to /home/fxb/PaddleSpeech-develop/data/认知.srt
+ ```
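+
+The `sentences` and `relative_times` lists printed at the end are what the script turns into the subtitle file: each sentence is paired with its `[begin, end]` span in seconds. The following is a minimal sketch of that conversion (illustrative Python, not the exact code in `websocket_client_srt.py`; `seconds_to_srt_time` and `write_srt` are hypothetical helper names):
+
+```python
+def seconds_to_srt_time(t: float) -> str:
+    """Format seconds as the SRT timestamp HH:MM:SS,mmm."""
+    ms = int(round(t * 1000))
+    h, rem = divmod(ms, 3600000)
+    m, rem = divmod(rem, 60000)
+    s, ms = divmod(rem, 1000)
+    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
+
+def write_srt(sentences, relative_times, path):
+    """Write one numbered SRT cue per (sentence, [begin, end]) pair."""
+    with open(path, "w", encoding="utf-8") as f:
+        for idx, (text, (bg, ed)) in enumerate(zip(sentences, relative_times), start=1):
+            f.write(f"{idx}\n{seconds_to_srt_time(bg)} --> {seconds_to_srt_time(ed)}\n{text}\n\n")
+
+# e.g. write_srt(sentences, relative_times, "认知.srt") produces a first cue of:
+# 1
+# 00:00:00,000 --> 00:00:02,100
+# 第一部分是认知部分
+```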
diff --git a/demos/streaming_asr_server/README_cn.md b/demos/streaming_asr_server/README_cn.md
index 1902a2fa..bbddd693 100644
--- a/demos/streaming_asr_server/README_cn.md
+++ b/demos/streaming_asr_server/README_cn.md
@@ -90,7 +90,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 server_executor = ServerExecutor()
 server_executor(
-    config_file="./conf/ws_conformer_wenetspeech_application",
+    config_file="./conf/ws_conformer_wenetspeech_application_faster.yaml",
     log_file="./log/paddlespeech.log")
 ```
@@ -578,3 +578,354 @@ bash server.sh
 [2022-05-07 11:11:18,915] [ INFO] - audio duration: 4.9968125, elapsed time: 15.928460597991943, RTF=3.187724293835709
 [2022-05-07 11:11:18,916] [ INFO] - asr websocket client finished : 我认为跑步最重要的就是给我带来了身体健康
 ```
+
+## Generate a subtitle file (.srt) from an audio file (.wav or .mp3)
+
+**Note:** both services are deployed on the `cpu` device by default; speech recognition and punctuation prediction can be placed on different `gpu`s by changing the `device` parameter in each service's config file.
+
+Start the streaming speech recognition service and the punctuation prediction service with `streaming_asr_server.py` and `punc_server.py` respectively. The `websocket_client_srt.py` script then calls both services at once and writes the corresponding subtitle file (.srt).
+
+**ffmpeg must be installed before running this script** (it is used to convert `.mp3` input to `.wav`, as the log below shows).
+
+**Run the following commands from the corresponding `.../demos/streaming_asr_server/` directory.**
+
+### 1. Start the server
+
+```bash
+# Note: streaming speech recognition and punctuation prediction can be configured onto different graphics cards through their configuration files
+paddlespeech_server start --config_file ./conf/ws_conformer_wenetspeech_application.yaml
+```
+
+Open another terminal and run:
+```bash
+paddlespeech_server start --config_file conf/punc_application.yaml
+```
+
+### 2. Start the client
+
+```bash
+python3 local/websocket_client_srt.py --server_ip 127.0.0.1 --port 8090 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ../../data/认知.mp3
+```
+Output:
+```text
+[2023-03-30 23:26:13,991] [ INFO] - Start to do streaming asr client
+[2023-03-30 23:26:13,994] [ INFO] - asr websocket client start
+[2023-03-30 23:26:13,994] [ INFO] - endpoint: http://127.0.0.1:8190/paddlespeech/text
+[2023-03-30 23:26:13,994] [ INFO] - endpoint: ws://127.0.0.1:8090/paddlespeech/asr/streaming
+[2023-03-30 23:26:14,475] [ INFO] - /home/fxb/PaddleSpeech-develop/data/认知.mp3 converted to /home/fxb/PaddleSpeech-develop/data/认知.wav
+[2023-03-30 23:26:14,476] [ INFO] - start to process the wavscp: /home/fxb/PaddleSpeech-develop/data/认知.wav
+[2023-03-30 23:26:14,515] [ INFO] - client receive msg={"status": "ok", "signal": "server_ready"}
+[2023-03-30 23:26:14,533] [ INFO] - client receive msg={'result': ''}
+[2023-03-30 23:26:15,122] [ INFO] - client receive msg={'result': '第一部'}
+[2023-03-30 23:26:15,637] [ INFO] - client receive msg={'result': '第一部分是认'}
+[2023-03-30 23:26:16,146] [ INFO] - client receive msg={'result': '第一部分是认知部分'}
+[2023-03-30 23:26:17,279] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通'}
+[2023-03-30 23:26:17,958] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图'}
+[2023-03-30 23:26:18,653] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本'}
+...
+[2023-03-30 23:26:44,346] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以'}
+[2023-03-30 23:26:45,226] [ INFO] - client receive msg={'result': '第一部分是认知部分该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理让学生对设备有大致的认知随后使用真实传感器的内部构造图辅以文字说明进一步帮助学生对传感器有更深刻的印象最后结合具体的实践应用提升学生对实训的兴趣以及意义感'}
+[2023-03-30 23:26:46,380] [ INFO] - client punctuation restored msg={'result': '第一部分是认知部分,该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理,让学生对设备有大致的认知。随后使用真实传感器的内部构造图,辅以文字说明,进一步帮助学生对传感器有更深刻的印象,最后结合具体的实践应用,提升学生对实训的兴趣以及意义感。'}
+[2023-03-30 23:27:01,059] [ INFO] - client final receive msg={'status': 'ok', 'signal': 'finished', 'result': '第一部分是认知部分,该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理,让学生对设备有大致的认知。随后使用真实传感器的内部构造图,辅以文字说明,进一步帮助学生对传感器有更深刻的印象,最后结合具体的实践应用,提升学生对实训的兴趣以及意义感。', 'times': [{'w': '第', 'bg': 0.0, 'ed': 0.36}, {'w': '一', 'bg': 0.36, 'ed': 0.48}, {'w': '部', 'bg': 0.48, 'ed': 0.62}, ..., {'w': '义', 'bg': 25.34, 'ed': 25.46}, {'w': '感', 'bg': 25.46, 'ed': 26.04}]}
+[2023-03-30 23:27:01,060] [ INFO] - audio duration: 26.04, elapsed time: 46.581613540649414, RTF=1.7888484462614982
+sentences: ['第一部分是认知部分', '该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理', '让学生对设备有大致的认知', '随后使用真实传感器的内部构造图', '辅以文字说明', '进一步帮助学生对传感器有更深刻的印象', '最后结合具体的实践应用', '提升学生对实训的兴趣以及意义感']
+relative_times: [[0.0, 2.1], [2.1, 8.06], [8.06, 11.040000000000001], [11.040000000000001, 14.280000000000001], [14.280000000000001, 15.72], [15.72, 19.8], [19.8, 22.44], [22.44, 26.04]]
+[2023-03-30 23:27:01,076] [ INFO] - results saved to /home/fxb/PaddleSpeech-develop/data/认知.srt
+```
21.76, 'ed': 21.92}, {'w': '应', 'bg': 21.92, 'ed': 22.080000000000002}, {'w': '用', 'bg': 22.080000000000002, 'ed': 22.44}, {'w': '提', 'bg': 22.44, 'ed': 22.78}, {'w': '升', 'bg': 22.78, 'ed': 22.94}, {'w': '学', 'bg': 22.94, 'ed': 23.12}, {'w': '生', 'bg': 23.12, 'ed': 23.34}, {'w': '对', 'bg': 23.34, 'ed': 23.62}, {'w': '实', 'bg': 23.62, 'ed': 23.82}, {'w': '训', 'bg': 23.82, 'ed': 23.96}, {'w': '的', 'bg': 23.96, 'ed': 24.12}, {'w': '兴', 'bg': 24.12, 'ed': 24.3}, {'w': '趣', 'bg': 24.3, 'ed': 24.6}, {'w': '以', 'bg': 24.6, 'ed': 24.88}, {'w': '及', 'bg': 24.88, 'ed': 25.12}, {'w': '意', 'bg': 25.12, 'ed': 25.34}, {'w': '义', 'bg': 25.34, 'ed': 25.46}, {'w': '感', 'bg': 25.46, 'ed': 26.04}]} +[2023-03-30 23:27:01,060] [ INFO] - audio duration: 26.04, elapsed time: 46.581613540649414, RTF=1.7888484462614982 +sentences: ['第一部分是认知部分', '该部分通过示意图和文本的形式向学生讲解主要传感器的工作原理', '让学生对设备有大致的认知', '随后使用真实传感器的内部构造图', '辅以文字说明', '进一步帮助学生对传感器有更深刻的印象', '最后结合具体的实践应用', '提升学生对实训的兴趣以及意义感'] +relative_times: [[0.0, 2.1], [2.1, 8.06], [8.06, 11.040000000000001], [11.040000000000001, 14.280000000000001], [14.280000000000001, 15.72], [15.72, 19.8], [19.8, 22.44], [22.44, 26.04]] +[2023-03-30 23:27:01,076] [ INFO] - results saved to /home/fxb/PaddleSpeech-develop/data/认知.srt + ``` diff --git a/demos/streaming_asr_server/local/websocket_client_srt.py b/demos/streaming_asr_server/local/websocket_client_srt.py new file mode 100644 index 00000000..02fea484 --- /dev/null +++ b/demos/streaming_asr_server/local/websocket_client_srt.py @@ -0,0 +1,162 @@ +#!/usr/bin/python +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# calc avg RTF(NOT Accurate): grep -rn RTF log.txt | awk '{print $NF}' | awk -F "=" '{sum += $NF} END {print "all time",sum, "audio num", NR, "RTF", sum/NR}'
+# python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
+# python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --wavfile ./zh.wav
+import argparse
+import asyncio
+import codecs
+import os
+from pydub import AudioSegment
+import re
+
+from paddlespeech.cli.log import logger
+from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler
+
+def convert_to_wav(input_file):
+    # Load audio file
+    audio = AudioSegment.from_file(input_file)
+
+    # Set parameters for audio file: mono, 16 kHz
+    audio = audio.set_channels(1)
+    audio = audio.set_frame_rate(16000)
+
+    # Create output filename
+    output_file = os.path.splitext(input_file)[0] + ".wav"
+
+    # Export audio file as WAV
+    audio.export(output_file, format="wav")
+
+    logger.info(f"{input_file} converted to {output_file}")
+
+def format_time(sec):
+    # Convert seconds to SRT format (HH:MM:SS,ms)
+    hours = int(sec/3600)
+    minutes = int((sec%3600)/60)
+    seconds = int(sec%60)
+    milliseconds = int((sec%1)*1000)
+    return f'{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}'
+
+def results2srt(results, srt_file):
+    """convert results from paddlespeech to srt format for subtitle
+    Args:
+        results (dict): results from paddlespeech
+    """
+    # times contains start and end time of each word
+    times = results['times']
+    # result contains the whole sentence including punctuation
+    result = results['result']
+    # split result into several sentences by ',' and '。'
+    sentences = re.split(',|。', result)[:-1]
+    # print("sentences: ", sentences)
+    # generate relative time for each sentence in sentences
+    relative_times = []
+    word_i = 0
+    for sentence in sentences:
+        relative_times.append([])
+        for word in sentence:
+            if relative_times[-1] == []:
+                relative_times[-1].append(times[word_i]['bg'])
+            if len(relative_times[-1]) == 1:
+                relative_times[-1].append(times[word_i]['ed'])
+            else:
+                relative_times[-1][1] = times[word_i]['ed']
+            word_i += 1
+    # print("relative_times: ", relative_times)
+    # generate srt file according to relative_times and sentences
+    with open(srt_file, 'w') as f:
+        for i in range(len(sentences)):
+            # Write index number
+            f.write(str(i+1)+'\n')
+
+            # Write start and end times
+            start = format_time(relative_times[i][0])
+            end = format_time(relative_times[i][1])
+            f.write(start + ' --> ' + end + '\n')
+
+            # Write text
+            f.write(sentences[i]+'\n\n')
+    logger.info(f"results saved to {srt_file}")
+
+def main(args):
+    logger.info("asr websocket client start")
+    handler = ASRWsAudioHandler(
+        args.server_ip,
+        args.port,
+        endpoint=args.endpoint,
+        punc_server_ip=args.punc_server_ip,
+        punc_server_port=args.punc_server_port)
+    loop = asyncio.get_event_loop()
+
+    # check if the wav file is mp3 format
+    # if so, convert it to wav format using convert_to_wav function
+    if args.wavfile and os.path.exists(args.wavfile):
+        if args.wavfile.endswith(".mp3"):
+            convert_to_wav(args.wavfile)
+            args.wavfile = args.wavfile.replace(".mp3", ".wav")
+
+    # support to process single audio file
+    if args.wavfile and os.path.exists(args.wavfile):
+        logger.info(f"start to process the wav file: {args.wavfile}")
+        result = loop.run_until_complete(handler.run(args.wavfile))
+        # result = result["result"]
+        # logger.info(f"asr websocket client finished : {result}")
+        results2srt(result, args.wavfile.replace(".wav", ".srt"))
+
+    # support to 
process batch audios from wav.scp + if args.wavscp and os.path.exists(args.wavscp): + logger.info(f"start to process the wavscp: {args.wavscp}") + with codecs.open(args.wavscp, 'r', encoding='utf-8') as f,\ + codecs.open("result.txt", 'w', encoding='utf-8') as w: + for line in f: + utt_name, utt_path = line.strip().split() + result = loop.run_until_complete(handler.run(utt_path)) + result = result["result"] + w.write(f"{utt_name} {result}\n") + + +if __name__ == "__main__": + logger.info("Start to do streaming asr client") + parser = argparse.ArgumentParser() + parser.add_argument( + '--server_ip', type=str, default='127.0.0.1', help='server ip') + parser.add_argument('--port', type=int, default=8090, help='server port') + parser.add_argument( + '--punc.server_ip', + type=str, + default=None, + dest="punc_server_ip", + help='Punctuation server ip') + parser.add_argument( + '--punc.port', + type=int, + default=8091, + dest="punc_server_port", + help='Punctuation server port') + parser.add_argument( + "--endpoint", + type=str, + default="/paddlespeech/asr/streaming", + help="ASR websocket endpoint") + parser.add_argument( + "--wavfile", + action="store", + help="wav file path ", + default="./16_audio.wav") + parser.add_argument( + "--wavscp", type=str, default=None, help="The batch audios dict text") + args = parser.parse_args() + + main(args) diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index 41dcf820..d7bb8ca1 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -58,7 +58,18 @@ The input of this demo should be a text of the specific language that can be pas paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175_pwgan.wav paddlespeech tts --am fastspeech2_mix --voc hifigan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175.wav ``` - - Use ONNXRuntime infer: + - Chinese English Mixed, single male spk + ```bash + # male mix tts + # The `lang` must be `mix`! + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_pwgan.wav + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_hifigan.wav + ``` + - Cantonese + ```bash + paddlespeech tts --am fastspeech2_canton --voc pwgan_aishell3 --input "各个国家有各个国家嘅国歌" --lang canton --spk_id 10 + ``` + - Use ONNXRuntime infer: ```bash paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output default.wav --use_onnx True paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" --output ss.wav --use_onnx True @@ -70,7 +81,15 @@ The input of this demo should be a text of the specific language that can be pas paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output lj_fs2_hifigan.wav --use_onnx True paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0 --output vctk_fs2_pwgan.wav --use_onnx True paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." 
--lang en --spk_id 0 --output vctk_fs2_hifigan.wav --use_onnx True - ``` + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" --output male_zh_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" --output male_zh_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_mix --voc hifigan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --spk_id 174 --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output mix_fs2_pwgan_csmsc_spk174.wav --use_onnx True + paddlespeech tts --am fastspeech2_canton --voc pwgan_aishell3 --lang canton --spk_id 10 --input "各个国家有各个国家嘅国歌" --output output_canton.wav --use_onnx True + ``` Usage: @@ -161,6 +180,10 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by | fastspeech2_mix | mix | | tacotron2_csmsc | zh | | tacotron2_ljspeech | en | + | fastspeech2_male | zh | + | fastspeech2_male | en | + | fastspeech2_male | mix | + | fastspeech2_canton | canton | - Vocoder | Model | Language | @@ -176,3 +199,5 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by | hifigan_aishell3 | zh | | hifigan_vctk | en | | wavernn_csmsc | zh | + | pwgan_male | zh | + | hifigan_male | zh | diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md index 4a413223..d8a2a14c 100644 --- a/demos/text_to_speech/README_cn.md +++ b/demos/text_to_speech/README_cn.md @@ -58,7 +58,18 @@ paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175_pwgan.wav paddlespeech tts --am fastspeech2_mix --voc hifigan_csmsc --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --spk_id 175 --output mix_spk175.wav ``` - - 使用 ONNXRuntime 推理: + - 中英文混合,单个男性说话人 + ```bash + # male mix tts + # The `lang` must be `mix`! + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_pwgan.wav + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang mix --input "我们的声学模型使用了 Fast Speech Two, 声码器使用了 Parallel Wave GAN and Hifi GAN." --output male_mix_fs2_hifigan.wav + ``` + - 粤语 + ```bash + paddlespeech tts --am fastspeech2_canton --voc pwgan_aishell3 --input "各个国家有各个国家嘅国歌" --lang canton --spk_id 10 + ``` + - 使用 ONNXRuntime 推理: ```bash paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" --output default.wav --use_onnx True paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" 
--output ss.wav --use_onnx True @@ -70,7 +81,15 @@ paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output lj_fs2_hifigan.wav --use_onnx True paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0 --output vctk_fs2_pwgan.wav --use_onnx True paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0 --output vctk_fs2_hifigan.wav --use_onnx True - ``` + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" --output male_zh_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc pwgan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_pwgan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang zh --input "你好,欢迎使用百度飞桨深度学习框架!" --output male_zh_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_male --voc hifigan_male --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." --output male_en_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_mix --voc hifigan_male --lang mix --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" --output male_fs2_hifigan.wav --use_onnx True + paddlespeech tts --am fastspeech2_mix --voc pwgan_csmsc --lang mix --spk_id 174 --input "热烈欢迎您在 Discussions 中提交问题,并在 Issues 中指出发现的 bug。此外,我们非常希望您参与到 Paddle Speech 的开发中!" 
--output mix_fs2_pwgan_csmsc_spk174.wav --use_onnx True + paddlespeech tts --am fastspeech2_canton --voc pwgan_aishell3 --lang canton --spk_id 10 --input "各个国家有各个国家嘅国歌" --output output_canton.wav --use_onnx True + ``` 使用方法: @@ -161,6 +180,10 @@ | fastspeech2_mix | mix | | tacotron2_csmsc | zh | | tacotron2_ljspeech | en | + | fastspeech2_male | zh | + | fastspeech2_male | en | + | fastspeech2_male | mix | + | fastspeech2_canton | canton | - 声码器 | 模型 | 语言 | @@ -176,3 +199,5 @@ | hifigan_aishell3 | zh | | hifigan_vctk | en | | wavernn_csmsc | zh | + | pwgan_male | zh | + | hifigan_male | zh | diff --git a/docker/ubuntu18-cpu/Dockerfile b/docker/ubuntu18-cpu/Dockerfile index 35f45f2e..3ae48cb6 100644 --- a/docker/ubuntu18-cpu/Dockerfile +++ b/docker/ubuntu18-cpu/Dockerfile @@ -2,7 +2,7 @@ FROM registry.baidubce.com/paddlepaddle/paddle:2.2.2 LABEL maintainer="paddlesl@baidu.com" RUN apt-get update \ - && apt-get install libsndfile-dev \ + && apt-get install libsndfile-dev libsndfile1 \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* diff --git a/docs/images/note_map.png b/docs/images/note_map.png new file mode 100644 index 00000000..f280d98c Binary files /dev/null and b/docs/images/note_map.png differ diff --git a/docs/requirements.txt b/docs/requirements.txt index bd7f40ec..30622230 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,12 +1,9 @@ braceexpand -colorlog editdistance -fastapi g2p_en g2pM h5py inflect -jieba jsonlines kaldiio keyboard @@ -16,7 +13,7 @@ matplotlib myst-parser nara_wpe numpydoc -onnxruntime==1.10.0 +onnxruntime>=1.11.0 opencc paddlenlp # use paddlepaddle == 2.3.* according to: https://github.com/PaddlePaddle/Paddle/issues/48243 @@ -24,31 +21,25 @@ paddlepaddle>=2.2.2,<2.4.0 paddlespeech_ctcdecoders paddlespeech_feat pandas -pathos==0.2.8 pattern_singleton -Pillow>=9.0.0 -praatio==5.0.0 +ppdiffusers>=0.9.0 +praatio>=5.0.0, <=5.1.1 prettytable pypinyin-dict pypinyin<=0.44.0 python-dateutil -pyworld==0.2.12 +pyworld>=0.2.12 recommonmark>=0.5.0 -resampy==0.2.2 +resampy sacrebleu -scipy -sentencepiece~=0.1.96 -soundfile~=0.10 sphinx sphinx-autobuild sphinx-markdown-tables sphinx_rtd_theme textgrid timer -tqdm -typeguard -uvicorn -visualdl +ToJyutping==0.2.1 +typeguard==2.13.3 webrtcvad websockets yacs~=0.1.8 diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 87c58b78..9e922177 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -3,20 +3,21 @@ ## Speech-to-Text Models ### Speech Recognition Model -Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link | Inference Type | -:-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: | :-----: | -[Ds2 Online Wenetspeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.4.model.tar.gz) | Wenetspeech Dataset | Char-based | 1.2 GB | 2 Conv + 5 LSTM layers | 0.152 (test\_net, w/o LM)
0.2417 (test\_meeting, w/o LM)
0.053 (aishell, w/ LM) |-| 10000 h | - | onnx/inference/python | -[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) | onnx/inference/python | -[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz)| Aishell Dataset | Char-based | 1.4 GB | 2 Conv + 5 bidirectional LSTM layers| 0.0554 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) | inference/python | -[Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 0.1879 (test\_meeting) |-| 10000 h |- | python | -[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz) | WenetSpeech Dataset | Char-based | 476 MB | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python | -[Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) | python | -[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.0.1.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0460 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) | python | -[Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) | python | -[Ds2 Offline Librispeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_offline_librispeech_ckpt_1.0.1.model.tar.gz)| Librispeech Dataset | Char-based | 1.3 GB | 2 Conv + 5 bidirectional LSTM layers| - |0.0467| 960 h | [Ds2 Offline Librispeech ASR0](../../examples/librispeech/asr0) | inference/python | -[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0338 | 960 h | [Conformer Librispeech ASR1](../../examples/librispeech/asr1) | python | -[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../examples/librispeech/asr1) | python 
| -[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/asr2_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../examples/librispeech/asr2) | python | +Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link | Inference Type | static_model | +:-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: | :-----: | :-----: | +[Ds2 Online Wenetspeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.4.model.tar.gz) | Wenetspeech Dataset | Char-based | 1.2 GB | 2 Conv + 5 LSTM layers | 0.152 (test\_net, w/o LM)
0.2417 (test\_meeting, w/o LM)
0.053 (aishell, w/ LM) |-| 10000 h | - | onnx/inference/python |-| +[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers | 0.0666 |-| 151 h | [Ds2 Online Aishell ASR0](../../examples/aishell/asr0) | onnx/inference/python |-| +[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz)| Aishell Dataset | Char-based | 1.4 GB | 2 Conv + 5 bidirectional LSTM layers| 0.0554 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) | inference/python |-| +[Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 0.1879 (test\_meeting) |-| 10000 h |- | python |-| +[Conformer U2PP Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz) | WenetSpeech Dataset | Char-based | 540 MB | Encoder:Conformer, Decoder:BiTransformer, Decoding method: Attention rescoring| 0.047198 (aishell test\_-1) 0.059212 (aishell test\_16) |-| 10000 h |- | python |[FP32](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_u2pp_wenetspeech_ckpt_1.3.0.model.tar.gz)
[INT8](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/static/asr1_chunk_conformer_u2pp_wenetspeech_static_quant_1.3.0.model.tar.gz) | +[Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) | python |-| +[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.0.1.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0460 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) | python |-| +[Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) | python |-| +[Ds2 Offline Librispeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_offline_librispeech_ckpt_1.0.1.model.tar.gz)| Librispeech Dataset | Char-based | 1.3 GB | 2 Conv + 5 bidirectional LSTM layers| - |0.0467| 960 h | [Ds2 Offline Librispeech ASR0](../../examples/librispeech/asr0) | inference/python |-| +[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0338 | 960 h | [Conformer Librispeech ASR1](../../examples/librispeech/asr1) | python |-| +[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../examples/librispeech/asr1) | python |-| +[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/asr2_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../examples/librispeech/asr2) | python |-| +[Conformer TALCS ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/tal_cs/asr1/asr1_conformer_talcs_ckpt_1.4.0.model.tar.gz) | TALCS Dataset | subword-based | 470 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0844 | 587 h | [Conformer TALCS ASR1](../../examples/tal_cs/asr1) | python |-| ### Self-Supervised Pre-trained Model Model | Pre-Train Method | Pre-Train Data | Finetune Data | Size | Descriptions | CER | WER | Example Link | @@ -24,12 +25,12 @@ Model | Pre-Train Method | Pre-Train Data | Finetune Data | Size | Descriptions [Wav2vec2-large-960h-lv60-self Model](https://paddlespeech.bj.bcebos.com/wav2vec/wav2vec2-large-960h-lv60-self.pdparams) | wav2vec2 | Librispeech and LV-60k Dataset (5.3w h) | - | 1.18 GB |Pre-trained Wav2vec2.0 Model | - | - | - | [Wav2vec2ASR-large-960h-librispeech 
Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr3/wav2vec2ASR-large-960h-librispeech_ckpt_1.3.1.model.tar.gz) | wav2vec2 | Librispeech and LV-60k Dataset (5.3w h) | Librispeech (960 h) | 718 MB |Encoder: Wav2vec2.0, Decoder: CTC, Decoding method: Greedy search | - | 0.0189 | [Wav2vecASR Librispeech ASR3](../../examples/librispeech/asr3) | [Wav2vec2-large-wenetspeech-self Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2-large-wenetspeech-self_ckpt_1.3.0.model.tar.gz) | wav2vec2 | Wenetspeech Dataset (1w h) | - | 714 MB |Pre-trained Wav2vec2.0 Model | - | - | - | -[Wav2vec2ASR-large-aishell1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.3.0.model.tar.gz) | wav2vec2 | Wenetspeech Dataset (1w h) | aishell1 (train set) | 1.17 GB |Encoder: Wav2vec2.0, Decoder: CTC, Decoding method: Greedy search | 0.0453 | - | - | +[Wav2vec2ASR-large-aishell1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz) | wav2vec2 | Wenetspeech Dataset (1w h) | aishell1 (train set) | 1.18 GB |Encoder: Wav2vec2.0, Decoder: CTC, Decoding method: Greedy search | 0.0510 | - | - | ### Whisper Model Demo Link | Training Data | Size | Descriptions | CER | Model :-----------: | :-----:| :-------: | :-----: | :-----: |:---------:| -[Whisper](../../demos/whisper) | 680kh from internet | large: 5.8G,
medium: 2.9G,
small: 923M,
base: 277M,
tiny: 145M | Encoder:Transformer,
Decoder:Transformer,
Decoding method:
Greedy search | 2.7
(large, Librispeech) | [whisper-large](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-large-model.tar.gz)
[whisper-medium](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-medium-model.tar.gz)
[whisper-medium-English-only](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-medium-en-model.tar.gz)
[whisper-small](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-small-model.tar.gz)
[whisper-small-English-only](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-small-en-model.tar.gz)
[whisper-base](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-base-model.tar.gz)
[whisper-base-English-only](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-base-en-model.tar.gz)
[whisper-tiny](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-tiny-model.tar.gz)
[whisper-tiny-English-only](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-tiny-en-model.tar.gz) +[Whisper](../../demos/whisper) | 680kh from internet | large: 5.8G,
medium: 2.9G,
small: 923M,
base: 277M,
tiny: 145M | Encoder:Transformer,
Decoder:Transformer,
Decoding method:
Greedy search | 0.027
(large, Librispeech) | [whisper-large](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-large-model.tar.gz)
[whisper-medium](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-medium-model.tar.gz)
[whisper-medium-English-only](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-medium-en-model.tar.gz)
[whisper-small](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-small-model.tar.gz)
[whisper-small-English-only](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-small-en-model.tar.gz)
[whisper-base](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-base-model.tar.gz)
[whisper-base-English-only](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-base-en-model.tar.gz)
[whisper-tiny](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-tiny-model.tar.gz)
[whisper-tiny-English-only](https://paddlespeech.bj.bcebos.com/whisper/whisper_model_20221122/whisper-tiny-en-model.tar.gz) ### Language Model based on NGram |Language Model | Training Data | Token-based | Size | Descriptions| @@ -60,7 +61,10 @@ FastSpeech2| AISHELL-3 |[fastspeech2-aishell3](https://github.com/PaddlePaddle/P FastSpeech2| LJSpeech |[fastspeech2-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/tts3)|[fastspeech2_nosil_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip)|[fastspeech2_ljspeech_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_static_1.1.0.zip)
[fastspeech2_ljspeech_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_onnx_1.1.0.zip)
[fastspeech2_ljspeech_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_ljspeech_pdlite_1.3.0.zip)|145MB| FastSpeech2| VCTK |[fastspeech2-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3)|[fastspeech2_vctk_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_ckpt_1.2.0.zip)|[fastspeech2_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_static_1.1.0.zip)
[fastspeech2_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_onnx_1.1.0.zip)
[fastspeech2_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_pdlite_1.3.0.zip)| 145MB| FastSpeech2| ZH_EN |[fastspeech2-zh_en](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/zh_en_tts/tts3)|[fastspeech2_mix_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_ckpt_1.2.0.zip)|[fastspeech2_mix_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_static_0.2.0.zip)
[fastspeech2_mix_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/t2s/chinse_english_mixed/models/fastspeech2_mix_onnx_0.2.0.zip) | 145MB| -FastSpeech2| Male ||[fastspeech2_male_ckpt_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_ckpt_1.3.0.zip)| | | +FastSpeech2| male-zh ||[fastspeech2_male_zh_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_ckpt_1.4.0.zip)|[fastspeech2_male_zh_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_static_1.4.0.zip)
[fastspeech2_male_zh_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_zh_onnx_1.4.0.zip) |146MB| +FastSpeech2| male-en ||[fastspeech2_male_en_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_ckpt_1.4.0.zip)|[fastspeech2_male_en_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_static_1.4.0.zip)
[fastspeech2_male_en_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_en_onnx_1.4.0.zip) |145MB| +FastSpeech2| male-mix ||[fastspeech2_male_mix_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_ckpt_1.4.0.zip)|[fastspeech2_male_mix_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_static_1.4.0.zip)
[fastspeech2_male_mix_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_male_mix_onnx_1.4.0.zip) |146MB| +FastSpeech2| Cantonese |[fastspeech2-canton](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/canton/tts3)|[fastspeech2_canton_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_ckpt_1.4.0.zip)|[fastspeech2_canton_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_static_1.4.0.zip)
[fastspeech2_canton_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_canton_onnx_1.4.0.zip)|146MB| ### Vocoders Model Type | Dataset| Example Link | Pretrained Models| Static / ONNX / Paddle-Lite Models|Size (static) @@ -77,7 +81,8 @@ HiFiGAN | LJSpeech |[HiFiGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpe HiFiGAN | AISHELL-3 |[HiFiGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc5)|[hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip)|[hifigan_aishell3_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_static_1.1.0.zip)
[hifigan_aishell3_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_onnx_1.1.0.zip)
[hifigan_aishell3_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_pdlite_1.3.0.zip)|46MB| HiFiGAN | VCTK |[HiFiGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc5)|[hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip)|[hifigan_vctk_static_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_static_1.1.0.zip)
[hifigan_vctk_onnx_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_onnx_1.1.0.zip)
[hifigan_vctk_pdlite_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_pdlite_1.3.0.zip)|46MB| WaveRNN | CSMSC |[WaveRNN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc6)|[wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip)|[wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip)|18MB| -Parallel WaveGAN| Male ||[pwg_male_ckpt_1.3.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_male_ckpt_1.3.0.zip)||| +Parallel WaveGAN| Male ||[pwg_male_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_male_ckpt_1.4.0.zip)|[pwgan_male_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_male_static_1.4.0.zip)
[pwgan_male_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwgan_male_onnx_1.4.0.zip)|4.8M| +HiFiGAN| Male ||[hifigan_male_ckpt_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_male_ckpt_1.4.0.zip)|[hifigan_male_static_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_male_static_1.4.0.zip)
[hifigan_male_onnx_1.4.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_male_onnx_1.4.0.zip)|46M| ### Voice Cloning diff --git a/docs/source/tts/quick_start.md b/docs/source/tts/quick_start.md index d8dbc646..d2a1b4ec 100644 --- a/docs/source/tts/quick_start.md +++ b/docs/source/tts/quick_start.md @@ -79,8 +79,8 @@ checkpoint_name ├── snapshot_iter_*.pdz ├── speech_stats.npy ├── phone_id_map.txt -├── spk_id_map.txt (optimal) -└── tone_id_map.txt (optimal) +├── spk_id_map.txt (optional) +└── tone_id_map.txt (optional) ``` **Vocoders:** ```text diff --git a/docs/source/tts/quick_start_cn.md b/docs/source/tts/quick_start_cn.md index c56d9bb4..ba259643 100644 --- a/docs/source/tts/quick_start_cn.md +++ b/docs/source/tts/quick_start_cn.md @@ -87,8 +87,8 @@ checkpoint_name ├── snapshot_iter_*.pdz ├── speech_stats.npy ├── phone_id_map.txt -├── spk_id_map.txt (optimal) -└── tone_id_map.txt (optimal) +├── spk_id_map.txt (optional) +└── tone_id_map.txt (optional) ``` **Vocoders:** ```text diff --git a/docs/source/tts/svs_music_score.md b/docs/source/tts/svs_music_score.md new file mode 100644 index 00000000..9f351c00 --- /dev/null +++ b/docs/source/tts/svs_music_score.md @@ -0,0 +1,183 @@ +本人非音乐专业人士,如文档中有误欢迎指正。 + +# 一、常见基础 +## 1.1 简谱和音名(note) +
+(配图:简谱、音名与钢琴键盘对照)
+
+上图从左往右的黑键音名分别是:C#/Db,D#/Eb,F#/Gb,G#/Ab,A#/Bb
+钢琴88键如下图,分为大字一组,大字组,小字组,小字一组,小字二组,小字三组,小字四组。分别对应音名的后缀是 1 2 3 4 5 6 7,例如小字一组(C大调)包含的键分别为: C4,C#4/Db4,D4,D#4/Eb4,E4,F4,F#4/Gb4,G4,G#4/Ab4,A4,A#4/Bb4,B4
+钢琴八度音就是12345671八个音,最后一个音是高1。**遵循:全全半全全全半** 就会得到 1 2 3 4 5 6 7 (高)1 的音
+
+(配图:钢琴 88 键与音组对照)
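+下面用一小段 Python 草稿演示“全全半全全全半”的规律(仅作示意,非本仓库代码;黑键只写升号形式):
+```python
+# 按“全全半全全全半”(全=2 个半音,半=1 个半音)从主音推出大调音阶
+NOTES = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
+MAJOR_STEPS = [2, 2, 1, 2, 2, 2, 1]
+
+def major_scale(tonic):
+    idx = NOTES.index(tonic)
+    scale = [tonic]
+    for step in MAJOR_STEPS:
+        idx = (idx + step) % 12
+        scale.append(NOTES[idx])
+    return scale  # 共 8 个音,最后一个是高八度的 1
+
+print(major_scale('C'))  # ['C', 'D', 'E', 'F', 'G', 'A', 'B', 'C']
+print(major_scale('D'))  # ['D', 'E', 'F#', 'G', 'A', 'B', 'C#', 'D']
+```
+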
+
+## 1.2 十二大调
+"#"表示升调
+
+(配图:升号各大调)
+
+"b"表示降调
+
+(配图:降号各大调)
+
+所谓某大调,是指 Do(简谱 1)这个音从哪个键开始,例如 D 大调,就是用 D 这个键来表示 Do 这个音。
+下图是十二大调下简谱与音名的对应表。
+
+(配图:十二大调简谱与音名对照表)
+
+
+## 1.3 Tempo
+Tempo 用于表示速度(Speed of the beat/pulse),即一分钟里面有几拍(beats per minute,BPM)
+
+(配图:Tempo 与节拍示意)
+
+whole note --> 4 beats
+half note --> 2 beats
+quarter note --> 1 beat
+eighth note --> 1/2 beat
+sixteenth note --> 1/4 beat
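+上面的换算可以写成一小段 Python 验证(仅作示意,BPM 取下文 2.1 节谱例的 95):
+```python
+# 一拍时长 = 60 / BPM(秒),各音符时值按拍数折算
+bpm = 95
+beat = 60 / bpm  # ≈ 0.631578 秒
+for name, beats in [('whole', 4), ('half', 2), ('quarter', 1), ('eighth', 0.5), ('sixteenth', 0.25)]:
+    print(f'{name} note: {beats * beat:.6f} s')
+```
+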
+ + +# 二、应用试验 +## 2.1 从谱中获取 music scores +music scores 包含:note,note_dur,is_slur + +
+(配图:简谱谱例)
+ +从左上角的谱信息 *bE* 可以得出该谱子是 **降E大调**,可以对应1.2小节十二大调简谱音名对照表根据 简谱获取对应的note +从左上角的谱信息 *quarter note* 可以得出该谱子的速度是 **一分钟95拍(beat)**,一拍的时长 = **60/95 = 0.631578s** +从左上角的谱信息 *4/4* 可以得出该谱子表示四分音符为一拍(分母的4),每小节有4拍(分子的4) + +从该简谱上可以获取 music score 如下: + +|text |phone |简谱(辅助)后面的点表示高八音 |note (从小字组开始算) |几拍(辅助) |note_dur |is_slur| +:-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | +|小 |x |5 |A#3/Bb3 |半 |0.315789 |0 | +| |iao |5 |A#3/Bb3 |半 |0.315789 |0 | +|酒 |j |1. |D#4/Eb4 |半 |0.315789 |0 | +| |iu |1. |D#4/Eb4 |半 |0.315789 |0 | +|窝 |w |2. |F4 |半 |0.315789 |0 | +| |o |2. |F4 |半 |0.315789 |0 | +|长 |ch |3. |G4 |半 |0.315789 |0 | +| |ang |3. |G4 |半 |0.315789 |0 | +| |ang |1. |D#4/Eb4 |半 |0.315789 |1 | +|睫 |j |1. |D#4/Eb4 |半 |0.315789 |0 | +| |ie |1. |D#4/Eb4 |半 |0.315789 |0 | +| |ie |5 |A#3/Bb3 |半 |0.315789 |1 | +|毛 |m |5 |A#3/Bb3 |一 |0.631578 |0 | +| |ao |5 |A#3/Bb3 |一 |0.631578 |0 | +|是 |sh |5 |A#3/Bb3 |半 |0.315789 |0 | +| |i |5 |A#3/Bb3 |半 |0.315789 |0 | +|你 |n |3. |G4 |半 |0.315789 |0 | +| |i |3. |G4 |半 |0.315789 |0 | +|最 |z |2. |F4 |半 |0.315789 |0 | +| |ui |2. |F4 |半 |0.315789 |0 | +|美 |m |3. |G4 |半 |0.315789 |0 | +| |ei |3. |G4 |半 |0.315789 |0 | +|的 |d |2. |F4 |半 |0.315789 |0 | +| |e |2. |F4 |半 |0.315789 |0 | +|记 |j |7 |D4 |半 |0.315789 |0 | +| |i |7 |D4 |半 |0.315789 |0 | +|号 |h |5 |A#3/Bb3 |半 |0.315789 |0 | +| |ao |5 |A#3/Bb3 |半 |0.315789 |0 | + + +## 2.2 一些实验 + +
+| 序号 | 说明 | 合成音频(diffsinger_opencpop + pwgan_opencpop) |
+| :-: | :---------------------------------------------------------- | :-: |
+| 1 | 原始 opencpop 标注的 notes,note_durs,is_slurs,升F大调,起始在小字组(第3组) | (音频见原文) |
+| 2 | 原始 opencpop 标注的 notes 和 is_slurs,note_durs 改变(从谱子获取) | (音频见原文) |
+| 3 | 原始 opencpop 标注的 notes 去掉 rest(毛字一拍),is_slurs 和 note_durs 改变(从谱子获取) | (音频见原文) |
+| 4 | 从谱子获取 notes,note_durs,is_slurs,不含 rest(毛字一拍),起始在小字一组(第3组) | (音频见原文) |
+| 5 | 从谱子获取 notes,note_durs,is_slurs,加上 rest(毛字半拍,rest半拍),起始在小字一组(第3组) | (音频见原文) |
+| 6 | 从谱子获取 notes,is_slurs,包含 rest,note_durs 从原始标注获取,起始在小字一组(第3组) | (音频见原文) |
+| 7 | 从谱子获取 notes,note_durs,is_slurs,不含 rest(毛字一拍),起始在小字一组(第4组) | (音频见原文) |
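+把 2.1 节从谱中读出的 music score 组装成平行序列,大致如下(仅作示意,取 2.1 节表格的前三个字;实际输入格式以本仓库歌声合成相关示例脚本为准):
+```python
+# 前三个字“小酒窝”的 phone / note / note_dur / is_slur 平行序列
+phones    = ['x', 'iao', 'j', 'iu', 'w', 'o']
+notes     = ['A#3/Bb3', 'A#3/Bb3', 'D#4/Eb4', 'D#4/Eb4', 'F4', 'F4']
+note_durs = [0.315789] * 6  # 均为半拍:60 / 95 / 2 ≈ 0.315789 秒
+is_slurs  = [0] * 6
+assert len(phones) == len(notes) == len(note_durs) == len(is_slurs)
+```
+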
+ + +上述实验表明通过该方法来提取 music score 是可行的,但是在应用中可以**灵活地在歌词中加"AP"(用来表示吸气声)和"SP"(用来表示停顿声)**,对应的在 **note 上加 rest**,会使得整体的歌声合成更自然。 +除此之外,还要考虑哪一个大调并且以哪一组为起始**得到的 note 在训练数据集中出现过**,如若推理时传入训练数据中没有见过的 note, 合成出来的音频可能不是我们期待的音调。 + + +# 三、其他 +## 3.1 读取midi + +```python +import mido +mid = mido.MidiFile('2093.midi') +``` diff --git a/docs/tutorial/st/st_tutorial.ipynb b/docs/tutorial/st/st_tutorial.ipynb index 2fb85053..e755beba 100644 --- a/docs/tutorial/st/st_tutorial.ipynb +++ b/docs/tutorial/st/st_tutorial.ipynb @@ -62,7 +62,7 @@ "collapsed": false }, "source": [ - "# 使用Transformer进行端到端语音翻译的的基本流程\n", + "# 使用Transformer进行端到端语音翻译的基本流程\n", "## 基础模型\n", "由于 ASR 章节已经介绍了 Transformer 以及语音特征抽取,在此便不做过多介绍,感兴趣的同学可以去相关章节进行了解。\n", "\n", diff --git a/docs/tutorial/tts/tts_tutorial.ipynb b/docs/tutorial/tts/tts_tutorial.ipynb index 583adb01..0cecb680 100644 --- a/docs/tutorial/tts/tts_tutorial.ipynb +++ b/docs/tutorial/tts/tts_tutorial.ipynb @@ -464,7 +464,7 @@ "
FastSpeech2 网络结构图

\n", "\n", "\n", - "PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似),这样的合成结果可以更加**稳定**。\n", + "PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似),这样的合成结果可以更加**稳定**。\n", "
\n", "
FastPitch 网络结构图

\n", "\n", diff --git a/examples/aishell/asr0/local/train.sh b/examples/aishell/asr0/local/train.sh index 2b71b7f7..c0da3325 100755 --- a/examples/aishell/asr0/local/train.sh +++ b/examples/aishell/asr0/local/train.sh @@ -1,6 +1,6 @@ #!/bin/bash -if [ $# -lt 2 ] && [ $# -gt 3 ];then +if [ $# -lt 2 ] || [ $# -gt 3 ];then echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)" exit -1 fi diff --git a/examples/aishell/asr1/conf/chunk_squeezeformer.yaml b/examples/aishell/asr1/conf/chunk_squeezeformer.yaml new file mode 100644 index 00000000..35a90b7d --- /dev/null +++ b/examples/aishell/asr1/conf/chunk_squeezeformer.yaml @@ -0,0 +1,98 @@ +############################################ +# Network Architecture # +############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: squeezeformer +encoder_conf: + encoder_dim: 256 # dimension of attention + output_size: 256 # dimension of output + attention_heads: 4 + num_blocks: 12 # the number of encoder blocks + reduce_idx: 5 + recover_idx: 11 + feed_forward_expansion_factor: 8 + input_dropout_rate: 0.1 + feed_forward_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + adaptive_scale: true + cnn_module_kernel: 31 + normalize_before: false + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + time_reduction_layer_type: 'stream' + causal: true + use_dynamic_chunk: true + use_dynamic_left_chunk: false + +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 # sublayer output dropout + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + init_type: 'kaiming_uniform' # !Warning: need to convergence + +########################################### +# Data # +########################################### + +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test + + +########################################### +# Dataloader # +########################################### + +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 32 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 2 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 240 +accum_grad: 1 +global_grad_clip: 5.0 +dist_sampler: True +optim: adam +optim_conf: + lr: 0.001 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/conf/squeezeformer.yaml b/examples/aishell/asr1/conf/squeezeformer.yaml new file mode 100644 index 00000000..b7841aca --- /dev/null +++ b/examples/aishell/asr1/conf/squeezeformer.yaml @@ -0,0 +1,93 @@ +############################################ +# Network Architecture # 
+############################################ +cmvn_file: +cmvn_file_type: "json" +# encoder related +encoder: squeezeformer +encoder_conf: + encoder_dim: 256 # dimension of attention + output_size: 256 # dimension of output + attention_heads: 4 + num_blocks: 12 # the number of encoder blocks + reduce_idx: 5 + recover_idx: 11 + feed_forward_expansion_factor: 8 + input_dropout_rate: 0.1 + feed_forward_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + adaptive_scale: true + cnn_module_kernel: 31 + normalize_before: false + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + time_reduction_layer_type: 'conv1d' + +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 4 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + init_type: 'kaiming_uniform' # !Warning: need to convergence + +########################################### +# Data # +########################################### +train_manifest: data/manifest.train +dev_manifest: data/manifest.dev +test_manifest: data/manifest.test + +########################################### +# Dataloader # +########################################### +vocab_filepath: data/lang_char/vocab.txt +spm_model_prefix: '' +unit_type: 'char' +preprocess_config: conf/preprocess.yaml +feat_dim: 80 +stride_ms: 10.0 +window_ms: 25.0 +sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs +batch_size: 32 +maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced +maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced +minibatches: 0 # for debug +batch_count: auto +batch_bins: 0 +batch_frames_in: 0 +batch_frames_out: 0 +batch_frames_inout: 0 +num_workers: 2 +subsampling_factor: 1 +num_encs: 1 + +########################################### +# Training # +########################################### +n_epoch: 150 +accum_grad: 8 +global_grad_clip: 5.0 +dist_sampler: False +optim: adam +optim_conf: + lr: 0.002 + weight_decay: 1.0e-6 +scheduler: warmuplr +scheduler_conf: + warmup_steps: 25000 + lr_decay: 1.0 +log_interval: 100 +checkpoint: + kbest_n: 50 + latest_n: 5 diff --git a/examples/aishell/asr1/local/test.sh b/examples/aishell/asr1/local/test.sh index 26926b4a..8487e990 100755 --- a/examples/aishell/asr1/local/test.sh +++ b/examples/aishell/asr1/local/test.sh @@ -1,15 +1,21 @@ #!/bin/bash -if [ $# != 3 ];then - echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" - exit -1 -fi +set -e stage=0 stop_stage=100 + +source utils/parse_options.sh || exit 1; + ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') echo "using $ngpu gpus..." + +if [ $# != 3 ];then + echo "usage: ${0} config_path decode_config_path ckpt_path_prefix" + exit -1 +fi + config_path=$1 decode_config_path=$2 ckpt_prefix=$3 @@ -92,6 +98,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then fi if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then + echo "using sclite to compute cer..." 
# format the reference test file for sclite python utils/format_rsl.py \ --origin_ref data/manifest.test.raw \ diff --git a/examples/aishell/asr1/local/train.sh b/examples/aishell/asr1/local/train.sh index bfa8dd97..3d4f052a 100755 --- a/examples/aishell/asr1/local/train.sh +++ b/examples/aishell/asr1/local/train.sh @@ -17,7 +17,7 @@ if [ ${seed} != 0 ]; then echo "using seed $seed & FLAGS_cudnn_deterministic=True ..." fi -if [ $# -lt 2 ] && [ $# -gt 3 ];then +if [ $# -lt 2 ] || [ $# -gt 3 ];then echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)" exit -1 fi diff --git a/examples/aishell/asr3/README.md b/examples/aishell/asr3/README.md new file mode 100644 index 00000000..6b587e12 --- /dev/null +++ b/examples/aishell/asr3/README.md @@ -0,0 +1,198 @@ +# Wav2vec2ASR with Aishell +This example contains code used to finetune the [wav2vec2.0](https://arxiv.org/pdf/2006.11477.pdf) model with the [Aishell dataset](http://www.openslr.org/resources/33). +## Overview +All the scripts you need are in `run.sh`. There are several stages in `run.sh`, and each stage has its function. +| Stage | Function | +|:---- |:----------------------------------------------------------- | +| 0 | Process data. It includes:
(1) Download the dataset
(2) Calculate the CMVN of the train dataset
(3) Get the vocabulary file
(4) Get the manifest files of the train, development and test dataset
(5) Download the pretrained wav2vec2 model |
+| 1 | Train the model |
+| 2 | Get the final model by averaging the top-k models; setting k = 1 means choosing the best model |
+| 3 | Test the final model performance |
+| 4 | Infer the single audio file |
+
+
+You can choose to run a range of stages by setting `stage` and `stop_stage`.
+
+For example, if you want to execute the code in stage 2 and stage 3, you can run this script:
+```bash
+bash run.sh --stage 2 --stop_stage 3
+```
+Or you can set `stage` equal to `stop_stage` to run only one stage.
+For example, if you only want to run `stage 0`, you can use the script below:
+```bash
+bash run.sh --stage 0 --stop_stage 0
+```
+The document below will describe the scripts in `run.sh` in detail.
+## The Environment Variables
+`path.sh` contains the environment variables.
+```bash
+. ./path.sh
+. ./cmd.sh
+```
+This script needs to be run first. Another script is also needed:
+```bash
+source ${MAIN_ROOT}/utils/parse_options.sh
+```
+It supports passing options in the form `--variable value` to the shell scripts.
+## The Local Variables
+Some local variables are set in `run.sh`.
+`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU.
+`stage` denotes the number of the stage you want to start from in the experiments.
+`stop_stage` denotes the number of the stage you want to end at in the experiments.
+`conf_path` denotes the config path of the model.
+`avg_num` denotes the number K of top-K models you want to average to get the final model.
+`audio_file` denotes the file path of the single audio file you want to infer in stage 4.
+`ckpt` denotes the checkpoint prefix of the model, e.g. "wav2vec2ASR".
+
+You can set the local variables (except `ckpt`) when you use `run.sh`.
+
+For example, you can set `gpus` and `avg_num` on the command line:
+```bash
+bash run.sh --gpus 0,1 --avg_num 20
+```
+## Stage 0: Data Processing
+To use this example, you need to process the data first, and you can use stage 0 in `run.sh` to do this. The code is shown below:
+```bash
+ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ bash ./local/data.sh || exit -1
+ fi
+```
+Stage 0 is for processing the data.
+
+If you only want to process the data, you can run:
+```bash
+bash run.sh --stage 0 --stop_stage 0
+```
+You can also just run these scripts in your command line:
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+```
+After processing the data, the `data` directory will look like this:
+```bash
+data/
+|-- dev.meta
+|-- lang_char
+| `-- vocab.txt
+|-- manifest.dev
+|-- manifest.dev.raw
+|-- manifest.test
+|-- manifest.test.raw
+|-- manifest.train
+|-- manifest.train.raw
+|-- mean_std.json
+|-- test.meta
+|-- train.meta
+|-- train.csv
+|-- dev.csv
+|-- test.csv
+```
+
+Stage 0 also downloads the Chinese pre-trained [wav2vec2](https://paddlespeech.bj.bcebos.com/wav2vec/chinese-wav2vec2-large.pdparams) model.
+```bash
+mkdir -p exp/wav2vec2
+wget -P exp/wav2vec2 https://paddlespeech.bj.bcebos.com/wav2vec/chinese-wav2vec2-large.pdparams
+```
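+The manifest files above are typically JSON-lines text, so you can eyeball what data processing produced; a minimal sketch (the exact field names depend on the data prep scripts, so treat the keys here as assumptions):
+```python
+# print the first few entries of a generated manifest (one JSON object per line)
+import json
+
+with open('data/manifest.train', encoding='utf-8') as f:
+    for i, line in enumerate(f):
+        record = json.loads(line)
+        print(record.keys())  # field names vary by recipe; inspect rather than assume
+        if i == 2:
+            break
+```
+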
+## Stage 1: Model Training
+If you want to train the model, you can use stage 1 in `run.sh`. The code is shown below:
+```bash
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # train model, all `ckpt` under `exp` dir
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+ fi
+```
+If you want to train the model, you can use the script below to execute stage 0 and stage 1:
+```bash
+bash run.sh --stage 0 --stop_stage 1
+```
+or you can run these scripts in the command line (using CPU only):
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR
+```
+## Stage 2: Top-k Models Averaging
+After training the model, we need to get the final model for testing and inference. In every epoch, the model checkpoint is saved, so we can either choose the best model among them based on the validation loss, or sort them and average the parameters of the top-k models to get the final model. We can use stage 2 to do this, and the code is shown below. Note: we only train one epoch for wav2vec2ASR, so `avg_num` is set to 1.
+```bash
+ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+     # avg n best model
+     avg.sh best exp/${ckpt}/checkpoints ${avg_num}
+ fi
+```
+The `avg.sh` script is in `../../../utils/`, which is defined in `path.sh`.
+If you want to get the final model, you can use the script below to execute stage 0, stage 1, and stage 2:
+```bash
+bash run.sh --stage 0 --stop_stage 2
+```
+or you can run these scripts in the command line (using CPU only):
+
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR
+avg.sh best exp/wav2vec2ASR/checkpoints 1
+```
+## Stage 3: Model Testing
+The test stage is to evaluate the model performance. The code of the test stage is shown below:
+```bash
+ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+     # test ckpt avg_n
+     CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+ fi
+```
+If you want to train a model and test it, you can use the script below to execute stage 0, stage 1, stage 2, and stage 3:
+```bash
+bash run.sh --stage 0 --stop_stage 3
+```
+or you can run these scripts in the command line (using CPU only):
+```bash
+. ./path.sh
+. ./cmd.sh
+bash ./local/data.sh
+CUDA_VISIBLE_DEVICES= ./local/train.sh conf/wav2vec2ASR.yaml wav2vec2ASR
+avg.sh best exp/wav2vec2ASR/checkpoints 1
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1
+```
+## Pretrained Model
+You can get the pretrained wav2vec2ASR model from [this page](../../../docs/source/released_model.md).
+
+Use the `tar` command to unpack the model, and then you can use the script below to test it.
+
+For example:
+```bash
+wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz
+tar xzvf wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz
+source path.sh
+# If you have processed the data and generated the manifest files, you can skip the following 2 steps
+bash local/data.sh --stage -1 --stop_stage -1
+bash local/data.sh --stage 2 --stop_stage 2
+CUDA_VISIBLE_DEVICES= ./local/test.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1
+```
+The performance of the released models is shown [here](./RESULT.md).
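Before running the test command above, it can be worth verifying what the archive unpacked. The sketch below assumes the archive extracts into `exp/`, as the checkpoint path used by the test command implies:

```bash
# List the archive contents without extracting them.
tar tzvf wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz

# After extraction, the averaged checkpoint used by test.sh should be present.
ls -lh exp/wav2vec2ASR/checkpoints/
```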
+## Stage 4: Single Audio File Inference
+In some situations, you may want to use the trained model to run inference on a single audio file. You can use stage 4. The code is shown below:
+```bash
+ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+     # test a single .wav file
+     CUDA_VISIBLE_DEVICES=0 ./local/test_wav.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${audio_file} || exit -1
+ fi
+```
+You can train the model by yourself using `bash run.sh --stage 0 --stop_stage 3`, or you can download the pretrained model through the script below:
+```bash
+wget https://paddlespeech.bj.bcebos.com/s2t/aishell/asr3/wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz
+tar xzvf wav2vec2ASR-large-aishell1_ckpt_1.4.0.model.tar.gz
+```
+You can download the audio demo:
+```bash
+wget -nc https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/demo_01_03.wav -P data/
+```
+You need to prepare an audio file or use the audio demo above; please make sure the sample rate of the audio is 16 kHz. You can get the result of the audio demo by running the script below:
+```bash
+CUDA_VISIBLE_DEVICES= ./local/test_wav.sh conf/wav2vec2ASR.yaml conf/tuning/decode.yaml exp/wav2vec2ASR/checkpoints/avg_1 data/demo_01_03.wav
+```
diff --git a/examples/aishell/asr3/RESULT.md b/examples/aishell/asr3/RESULT.md
new file mode 100644
index 00000000..42edeac1
--- /dev/null
+++ b/examples/aishell/asr3/RESULT.md
@@ -0,0 +1,18 @@
+# AISHELL
+
+## Version
+
+* paddle version: develop (commit id: daea892c67e85da91906864de40ce9f6f1b893ae)
+* paddlespeech version: develop (commit id: c14b4238b256693281e59605abff7c9435b3e2b2)
+* paddlenlp version: 2.5.2
+
+## Device
+* python: 3.7
+* cuda: 10.2
+* cudnn: 7.6
+
+## Result
+Train: 80 epochs, 2 × V100-32G, batch size: 5
+| Model | Params | Config | Augmentation | Test set | Decode method | WER |
+| --- | --- | --- | --- | --- | --- | --- |
+| wav2vec2ASR | 324.49 M | conf/wav2vec2ASR.yaml | spec_aug | test-set | greedy search | 5.1009 |
diff --git a/examples/aishell/asr3/cmd.sh b/examples/aishell/asr3/cmd.sh
new file mode 100755
index 00000000..7b70ef5e
--- /dev/null
+++ b/examples/aishell/asr3/cmd.sh
@@ -0,0 +1,89 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#   --time