diff --git a/.gitignore b/.gitignore index ad8e74925..639472001 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ .DS_Store *.pyc .vscode -*log +*.log *.wav *.pdmodel *.pdiparams* @@ -34,4 +34,6 @@ tools/activate_python.sh tools/miniconda.sh tools/CRF++-0.58/ -speechx/fc_patch/ \ No newline at end of file +speechx/fc_patch/ + +third_party/ctc_decoders/paddlespeech_ctcdecoders.py diff --git a/.mergify.yml b/.mergify.yml index 6dae66d04..68b248101 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -52,7 +52,7 @@ pull_request_rules: add: ["T2S"] - name: "auto add label=Audio" conditions: - - files~=^audio/ + - files~=^paddleaudio/ actions: label: add: ["Audio"] diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7fb01708a..09e92a667 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -50,13 +50,13 @@ repos: entry: bash .pre-commit-hooks/clang-format.hook -i language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$ - exclude: (?=speechx/speechx/kaldi).*(\.cpp|\.cc|\.h|\.py)$ + exclude: (?=speechx/speechx/kaldi|speechx/patch).*(\.cpp|\.cc|\.h|\.py)$ - id: copyright_checker name: copyright_checker entry: python .pre-commit-hooks/copyright-check.hook language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$ - exclude: (?=third_party|pypinyin|speechx/speechx/kaldi).*(\.cpp|\.cc|\.h|\.py)$ + exclude: (?=third_party|pypinyin|speechx/speechx/kaldi|speechx/patch).*(\.cpp|\.cc|\.h|\.py)$ - repo: https://github.com/asottile/reorder_python_imports rev: v2.4.0 hooks: diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e8315e76..2782b8176 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,25 @@ # Changelog +Date: 2022-3-22, Author: yt605155624. +Add features to: CLI: + - Support aishell3_hifigan、vctk_hifigan + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1587 + +Date: 2022-3-09, Author: yt605155624. +Add features to: T2S: + - Add ljspeech hifigan egs. + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1549 + +Date: 2022-3-08, Author: yt605155624. +Add features to: T2S: + - Add aishell3 hifigan egs. + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1545 + +Date: 2022-3-08, Author: yt605155624. +Add features to: T2S: + - Add vctk hifigan egs. + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1544 + Date: 2022-1-29, Author: yt605155624. Add features to: T2S: - Update aishell3 vc0 with new Tacotron2. diff --git a/README.md b/README.md index 46f492e99..a90498293 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@

Quick Start + | Quick Start Server | Documents | Models List @@ -178,7 +179,9 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision -- 🤗 2021.12.14: Our PaddleSpeech [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) and [TTS](https://huggingface.co/spaces/akhaliq/paddlespeech) Demos on Hugging Face Spaces are available! +- 👏🏻 2022.03.28: PaddleSpeech Server is available for Audio Classification, Automatic Speech Recognition and Text-to-Speech. +- 👏🏻 2022.03.28: PaddleSpeech CLI is available for Speaker Verification. +- 🤗 2021.12.14: Our PaddleSpeech [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) and [TTS](https://huggingface.co/spaces/KPatrick/PaddleSpeechTTS) Demos on Hugging Face Spaces are available! - 👏🏻 2021.12.10: PaddleSpeech CLI is available for Audio Classification, Automatic Speech Recognition, Speech Translation (English to Chinese) and Text-to-Speech. ### Community @@ -203,10 +206,16 @@ Developers can have a try of our models with [PaddleSpeech Command Line](./paddl paddlespeech cls --input input.wav ``` +**Speaker Verification** +``` +paddlespeech vector --task spk --input input_16k.wav +``` + **Automatic Speech Recognition** ```shell paddlespeech asr --lang zh --input input_16k.wav ``` +- web demo for Automatic Speech Recognition is integrated to [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See Demo: [ASR Demo](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) **Speech Translation** (English to Chinese) (not support for Mac and Windows now) @@ -218,7 +227,7 @@ paddlespeech st --input input_16k.wav ```shell paddlespeech tts --input "你好,欢迎使用飞桨深度学习框架!" --output output.wav ``` -- web demo for Text to Speech is integrated to [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See Demo: [TTS Demo](https://huggingface.co/spaces/akhaliq/paddlespeech) +- web demo for Text to Speech is integrated to [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See Demo: [TTS Demo](https://huggingface.co/spaces/KPatrick/PaddleSpeechTTS) **Text Postprocessing** - Punctuation Restoration @@ -241,6 +250,36 @@ For more command lines, please see: [demos](https://github.com/PaddlePaddle/Padd If you want to try more functions like training and tuning, please have a look at [Speech-to-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-to-Speech Quick Start](./docs/source/tts/quick_start.md). + + +## Quick Start Server + +Developers can have a try of our speech server with [PaddleSpeech Server Command Line](./paddlespeech/server/README.md). + +**Start server** +```shell +paddlespeech_server start --config_file ./paddlespeech/server/conf/application.yaml +``` + +**Access Speech Recognition Services** +```shell +paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input input_16k.wav +``` + +**Access Text to Speech Services** +```shell +paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav +``` + +**Access Audio Classification Services** +```shell +paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input input.wav +``` + + +For more information about server command lines, please see: [speech server demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_server) + + ## Model List PaddleSpeech supports a series of most popular models. 
They are summarized in [released models](./docs/source/released_model.md) and attached with available pretrained models. @@ -397,9 +436,9 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r HiFiGAN - CSMSC + LJSpeech / VCTK / CSMSC / AISHELL-3 - HiFiGAN-csmsc + HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3 @@ -457,6 +496,29 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r +**Speaker Verification** + + + + + + + + + + + + + + + + + + +
Task Dataset Model Type Link
Speaker Verification VoxCeleb12 ECAPA-TDNN + ecapa-tdnn-voxceleb12 +
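Beyond the `paddlespeech vector --task spk` one-liner shown above, the same ECAPA-TDNN model can be called from Python. The sketch below is illustrative only: it mirrors the `VectorExecutor` usage added in `demos/audio_searching/src/encode.py` in this PR, the wav paths are placeholders, and scoring a pair of embeddings with cosine similarity is a common convention rather than a documented API guarantee.

```python
import numpy as np
from paddlespeech.cli import VectorExecutor

vector_executor = VectorExecutor()

# Extract a 192-dimensional speaker embedding per utterance (paths are placeholders).
emb1 = vector_executor(audio_file="utt1_16k.wav", model="ecapatdnn_voxceleb12")
emb2 = vector_executor(audio_file="utt2_16k.wav", model="ecapatdnn_voxceleb12")

# L2-normalize and score with cosine similarity; a higher score suggests the same speaker.
emb1 = emb1 / np.linalg.norm(emb1)
emb2 = emb2 / np.linalg.norm(emb2)
print(f"cosine score: {float(np.dot(emb1, emb2)):.4f}")
```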
+ **Punctuation Restoration** @@ -498,6 +560,7 @@ Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](ht - [Chinese Rule Based Text Frontend](./docs/source/tts/zh_text_frontend.md) - [Test Audio Samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) - [Audio Classification](./demos/audio_tagging/README.md) + - [Speaker Verification](./demos/speaker_verification/README.md) - [Speech Translation](./demos/speech_translation/README.md) - [Released Models](./docs/source/released_model.md) - [Community](#Community) @@ -573,7 +636,6 @@ You are warmly welcome to submit questions in [discussions](https://github.com/P - Many thanks to [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) for years of attention, constructive advice and great help. -- Many thanks to [AK391](https://github.com/AK391) for TTS web demo on Huggingface Spaces using Gradio. - Many thanks to [mymagicpower](https://github.com/mymagicpower) for the Java implementation of ASR upon [short](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk) and [long](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk) audio files. - Many thanks to [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) for developing Virtual Uploader(VUP)/Virtual YouTuber(VTuber) with PaddleSpeech TTS function. - Many thanks to [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) for contributing Punctuation Restoration model. diff --git a/README_cn.md b/README_cn.md index e84947372..ab4ce6e6b 100644 --- a/README_cn.md +++ b/README_cn.md @@ -6,6 +6,7 @@

快速开始 + | 快速使用服务 | 教程文档 | 模型列表 @@ -179,7 +180,9 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme -- 🤗 2021.12.14: 我们在 Hugging Face Spaces 上的 [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) 以及 [TTS](https://huggingface.co/spaces/akhaliq/paddlespeech) Demos 上线啦! +- 👏🏻 2022.03.28: PaddleSpeech Server 上线! 覆盖了声音分类、语音识别、以及语音合成。 +- 👏🏻 2022.03.28: PaddleSpeech CLI 上线声纹验证。 +- 🤗 2021.12.14: Our PaddleSpeech [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) and [TTS](https://huggingface.co/spaces/KPatrick/PaddleSpeechTTS) Demos on Hugging Face Spaces are available! - 👏🏻 2021.12.10: PaddleSpeech CLI 上线!覆盖了声音分类、语音识别、语音翻译(英译中)以及语音合成。 ### 技术交流群 @@ -202,6 +205,10 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme ```shell paddlespeech cls --input input.wav ``` +**声纹识别** +```shell +paddlespeech vector --task spk --input input_16k.wav +``` **语音识别** ```shell paddlespeech asr --lang zh --input input_16k.wav @@ -236,6 +243,33 @@ paddlespeech asr --input ./zh.wav | paddlespeech text --task punc 更多命令行命令请参考 [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos) > Note: 如果需要训练或者微调,请查看[语音识别](./docs/source/asr/quick_start.md), [语音合成](./docs/source/tts/quick_start.md)。 + +## 快速使用服务 +安装完成后,开发者可以通过命令行快速使用服务。 + +**启动服务** +```shell +paddlespeech_server start --config_file ./paddlespeech/server/conf/application.yaml +``` + +**访问语音识别服务** +```shell +paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input input_16k.wav +``` + +**访问语音合成服务** +```shell +paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav +``` + +**访问音频分类服务** +```shell +paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input input.wav +``` + +更多服务相关的命令行使用信息,请参考 [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_server) + + ## 模型列表 PaddleSpeech 支持很多主流的模型,并提供了预训练模型,详情请见[模型列表](./docs/source/released_model.md)。 @@ -392,9 +426,9 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声

- + @@ -453,6 +487,30 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
HiFiGANCSMSCLJSpeech / VCTK / CSMSC / AISHELL-3 - HiFiGAN-csmsc + HiFiGAN-ljspeech / HiFiGAN-vctk / HiFiGAN-csmsc / HiFiGAN-aishell3
+ +**声纹识别** + + + + + + + + + + + + + + + + + + +
Task Dataset Model Type Link
Speaker Verification VoxCeleb12 ECAPA-TDNN + ecapa-tdnn-voxceleb12 +
+ **标点恢复** @@ -499,6 +557,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声 - [中文文本前端](./docs/source/tts/zh_text_frontend.md) - [测试语音样本](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html) - [声音分类](./demos/audio_tagging/README_cn.md) + - [声纹识别](./demos/speaker_verification/README_cn.md) - [语音翻译](./demos/speech_translation/README_cn.md) - [模型列表](#模型列表) - [语音识别](#语音识别模型) @@ -521,6 +580,15 @@ author={PaddlePaddle Authors}, howpublished = {\url{https://github.com/PaddlePaddle/PaddleSpeech}}, year={2021} } + +@inproceedings{zheng2021fused, + title={Fused acoustic and text encoding for multimodal bilingual pretraining and speech translation}, + author={Zheng, Renjie and Chen, Junkun and Ma, Mingbo and Huang, Liang}, + booktitle={International Conference on Machine Learning}, + pages={12736--12746}, + year={2021}, + organization={PMLR} +} ``` @@ -568,7 +636,6 @@ year={2021} ## 致谢 - 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议,以及在诸多问题上的帮助。 -- 非常感谢 [AK391](https://github.com/AK391) 在 Huggingface Spaces 上使用 Gradio 对我们的语音合成功能进行网页版演示。 - 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。 - 非常感谢 [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) 采用 PaddleSpeech 语音合成功能实现 Virtual Uploader(VUP)/Virtual YouTuber(VTuber) 虚拟主播。 - 非常感谢 [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) 贡献标点重建相关模型。 diff --git a/dataset/librispeech/librispeech.py b/dataset/librispeech/librispeech.py index 69f0db599..65cab2490 100644 --- a/dataset/librispeech/librispeech.py +++ b/dataset/librispeech/librispeech.py @@ -20,12 +20,12 @@ of each audio file in the data set. 
""" import argparse import codecs -import distutils.util import io import json import os from multiprocessing.pool import Pool +import distutils.util import soundfile from utils.utility import download diff --git a/dataset/voxceleb/voxceleb1.py b/dataset/voxceleb/voxceleb1.py index e50c91bc1..905862008 100644 --- a/dataset/voxceleb/voxceleb1.py +++ b/dataset/voxceleb/voxceleb1.py @@ -59,12 +59,19 @@ DEV_TARGET_DATA = "vox1_dev_wav_parta* vox1_dev_wav.zip ae63e55b951748cc486645f5 TEST_LIST = {"vox1_test_wav.zip": "185fdc63c3c739954633d50379a3d102"} TEST_TARGET_DATA = "vox1_test_wav.zip vox1_test_wav.zip 185fdc63c3c739954633d50379a3d102" -# kaldi trial -# this trial file is organized by kaldi according the official file, -# which is a little different with the official trial veri_test2.txt -KALDI_BASE_URL = "http://www.openslr.org/resources/49/" -TRIAL_LIST = {"voxceleb1_test_v2.txt": "29fc7cc1c5d59f0816dc15d6e8be60f7"} -TRIAL_TARGET_DATA = "voxceleb1_test_v2.txt voxceleb1_test_v2.txt 29fc7cc1c5d59f0816dc15d6e8be60f7" +# voxceleb trial + +TRIAL_BASE_URL = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/" +TRIAL_LIST = { + "veri_test.txt": "29fc7cc1c5d59f0816dc15d6e8be60f7", # voxceleb1 + "veri_test2.txt": "b73110731c9223c1461fe49cb48dddfc", # voxceleb1(cleaned) + "list_test_hard.txt": "21c341b6b2168eea2634df0fb4b8fff1", # voxceleb1-H + "list_test_hard2.txt": + "857790e09d579a68eb2e339a090343c8", # voxceleb1-H(cleaned) + "list_test_all.txt": "b9ecf7aa49d4b656aa927a8092844e4a", # voxceleb1-E + "list_test_all2.txt": + "a53e059deb562ffcfc092bf5d90d9f3a" # voxceleb1-E(cleaned) +} parser = argparse.ArgumentParser(description=__doc__) parser.add_argument( @@ -82,7 +89,7 @@ args = parser.parse_args() def create_manifest(data_dir, manifest_path_prefix): - print("Creating manifest %s ..." % manifest_path_prefix) + print(f"Creating manifest {manifest_path_prefix} from {data_dir}") json_lines = [] data_path = os.path.join(data_dir, "wav", "**", "*.wav") total_sec = 0.0 @@ -114,6 +121,9 @@ def create_manifest(data_dir, manifest_path_prefix): # voxceleb1 is given explicit in the path data_dir_name = Path(data_dir).name manifest_path_prefix = manifest_path_prefix + "." 
+ data_dir_name + if not os.path.exists(os.path.dirname(manifest_path_prefix)): + os.makedirs(os.path.dirname(manifest_path_prefix)) + with codecs.open(manifest_path_prefix, 'w', encoding='utf-8') as f: for line in json_lines: f.write(line + "\n") @@ -133,11 +143,13 @@ def create_manifest(data_dir, manifest_path_prefix): def prepare_dataset(base_url, data_list, target_dir, manifest_path, target_data): if not os.path.exists(target_dir): - os.mkdir(target_dir) + os.makedirs(target_dir) # wav directory already exists, it need do nothing + # we will download the voxceleb1 data to ${target_dir}/vox1/dev/ or ${target_dir}/vox1/test directory if not os.path.exists(os.path.join(target_dir, "wav")): # download all dataset part + print("start to download the vox1 dev zip package") for zip_part in data_list.keys(): download_url = " --no-check-certificate " + base_url + "/" + zip_part download( @@ -167,10 +179,22 @@ def prepare_dataset(base_url, data_list, target_dir, manifest_path, create_manifest(data_dir=target_dir, manifest_path_prefix=manifest_path) +def prepare_trial(base_url, data_list, target_dir): + if not os.path.exists(target_dir): + os.makedirs(target_dir) + + for trial, md5sum in data_list.items(): + target_trial = os.path.join(target_dir, trial) + if not os.path.exists(os.path.join(target_dir, trial)): + download_url = " --no-check-certificate " + base_url + "/" + trial + download(url=download_url, md5sum=md5sum, target_dir=target_dir) + + def main(): if args.target_dir.startswith('~'): args.target_dir = os.path.expanduser(args.target_dir) + # prepare the vox1 dev data prepare_dataset( base_url=BASE_URL, data_list=DEV_LIST, @@ -178,6 +202,7 @@ def main(): manifest_path=args.manifest_prefix, target_data=DEV_TARGET_DATA) + # prepare the vox1 test data prepare_dataset( base_url=BASE_URL, data_list=TEST_LIST, @@ -185,6 +210,12 @@ def main(): manifest_path=args.manifest_prefix, target_data=TEST_TARGET_DATA) + # prepare the vox1 trial + prepare_trial( + base_url=TRIAL_BASE_URL, + data_list=TRIAL_LIST, + target_dir=os.path.dirname(args.manifest_prefix)) + print("Manifest prepare done!") diff --git a/dataset/voxceleb/voxceleb2.py b/dataset/voxceleb/voxceleb2.py new file mode 100644 index 000000000..22a2e2ffe --- /dev/null +++ b/dataset/voxceleb/voxceleb2.py @@ -0,0 +1,164 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare VoxCeleb2 dataset + +Download and unpack the voxceleb2 data files. 
+Voxceleb2 data is stored as the m4a format, +so we need convert the m4a to wav with the convert.sh scripts +""" +import argparse +import codecs +import glob +import json +import os +from pathlib import Path + +import soundfile + +from utils.utility import download +from utils.utility import unzip + +# all the data will be download in the current data/voxceleb directory default +DATA_HOME = os.path.expanduser('.') + +BASE_URL = "--no-check-certificate https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/" + +# dev data +DEV_DATA_URL = BASE_URL + '/vox2_aac.zip' +DEV_MD5SUM = "bbc063c46078a602ca71605645c2a402" + +# test data +TEST_DATA_URL = BASE_URL + '/vox2_test_aac.zip' +TEST_MD5SUM = "0d2b3ea430a821c33263b5ea37ede312" + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/voxceleb2/", + type=str, + help="Directory to save the voxceleb1 dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +parser.add_argument( + "--download", + default=False, + action="store_true", + help="Download the voxceleb2 dataset. (default: %(default)s)") +parser.add_argument( + "--generate", + default=False, + action="store_true", + help="Generate the manifest files. (default: %(default)s)") + +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path_prefix): + print("Creating manifest %s ..." % manifest_path_prefix) + json_lines = [] + data_path = os.path.join(data_dir, "**", "*.wav") + total_sec = 0.0 + total_text = 0.0 + total_num = 0 + speakers = set() + for audio_path in glob.glob(data_path, recursive=True): + audio_id = "-".join(audio_path.split("/")[-3:]) + utt2spk = audio_path.split("/")[-3] + duration = soundfile.info(audio_path).duration + text = "" + json_lines.append( + json.dumps( + { + "utt": audio_id, + "utt2spk": str(utt2spk), + "feat": audio_path, + "feat_shape": (duration, ), + "text": text # compatible with asr data format + }, + ensure_ascii=False)) + + total_sec += duration + total_text += len(text) + total_num += 1 + speakers.add(utt2spk) + + # data_dir_name refer to dev or test + # voxceleb2 is given explicit in the path + data_dir_name = Path(data_dir).name + manifest_path_prefix = manifest_path_prefix + "." + data_dir_name + + if not os.path.exists(os.path.dirname(manifest_path_prefix)): + os.makedirs(os.path.dirname(manifest_path_prefix)) + with codecs.open(manifest_path_prefix, 'w', encoding='utf-8') as f: + for line in json_lines: + f.write(line + "\n") + + manifest_dir = os.path.dirname(manifest_path_prefix) + meta_path = os.path.join(manifest_dir, "voxceleb2." 
+ + data_dir_name) + ".meta" + with codecs.open(meta_path, 'w', encoding='utf-8') as f: + print(f"{total_num} utts", file=f) + print(f"{len(speakers)} speakers", file=f) + print(f"{total_sec / (60 * 60)} h", file=f) + print(f"{total_text} text", file=f) + print(f"{total_text / total_sec} text/sec", file=f) + print(f"{total_sec / total_num} sec/utt", file=f) + + +def download_dataset(url, md5sum, target_dir, dataset): + if not os.path.exists(target_dir): + os.makedirs(target_dir) + + # wav directory already exists, it need do nothing + print("target dir {}".format(os.path.join(target_dir, dataset))) + # unzip the dev dataset will create the dev and unzip the m4a to dev dir + # but the test dataset will unzip to aac + # so, wo create the ${target_dir}/test and unzip the m4a to test dir + if not os.path.exists(os.path.join(target_dir, dataset)): + filepath = download(url, md5sum, target_dir) + if dataset == "test": + unzip(filepath, os.path.join(target_dir, "test")) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + # download and unpack the vox2-dev data + print("download: {}".format(args.download)) + if args.download: + download_dataset( + url=DEV_DATA_URL, + md5sum=DEV_MD5SUM, + target_dir=args.target_dir, + dataset="dev") + + download_dataset( + url=TEST_DATA_URL, + md5sum=TEST_MD5SUM, + target_dir=args.target_dir, + dataset="test") + + print("VoxCeleb2 download is done!") + + if args.generate: + create_manifest( + args.target_dir, manifest_path_prefix=args.manifest_prefix) + + +if __name__ == '__main__': + main() diff --git a/demos/README.md b/demos/README.md index 4482aa191..36e93dbf1 100644 --- a/demos/README.md +++ b/demos/README.md @@ -4,6 +4,7 @@ The directory containes many speech applications in multi scenarios. +* audio searching - mass audio similarity retrieval * audio tagging - multi-label tagging of an audio file * automatic_video_subtitiles - generate subtitles from a video * metaverse - 2D AR with TTS diff --git a/demos/README_cn.md b/demos/README_cn.md index 242b4f070..add6e25f5 100644 --- a/demos/README_cn.md +++ b/demos/README_cn.md @@ -4,6 +4,7 @@ 该目录包含基于 PaddleSpeech 开发的不同场景的语音应用 Demo: +* 声音检索 - 海量音频相似性检索。 * 声音分类 - 基于 AudioSet 的 527 类标签的音频多标签分类。 * 视频字幕生成 - 识别视频中语音的文本,并进行文本后处理。 * 元宇宙 - 基于语音合成的 2D 增强现实。 diff --git a/demos/audio_searching/README.md b/demos/audio_searching/README.md new file mode 100644 index 000000000..8a6f38639 --- /dev/null +++ b/demos/audio_searching/README.md @@ -0,0 +1,235 @@ +([简体中文](./README_cn.md)|English) + +# Audio Searching + +## Introduction +As the Internet continues to evolve, unstructured data such as emails, social media photos, live videos, and customer service voice calls have become increasingly common. If we want to process the data on a computer, we need to use embedding technology to transform the data into vector and store, index, and query it. + +However, when there is a large amount of data, such as hundreds of millions of audio tracks, it is more difficult to do a similarity search. The exhaustive method is feasible, but very time consuming. For this scenario, this demo will introduce how to build an audio similarity retrieval system using the open source vector database Milvus. + +Audio retrieval (speech, music, speaker, etc.) enables querying and finding similar sounds (or the same speaker) in a large amount of audio data. 
The audio similarity retrieval system can be used to identify similar sound effects, minimize intellectual property infringement, quickly retrieve the voice print library, and help enterprises control fraud and identity theft. Audio retrieval also plays an important role in the classification and statistical analysis of audio data. + +In this demo, you will learn how to build an audio retrieval system to retrieve similar sound snippets. The uploaded audio clips are converted into vector data using paddlespeech-based pre-training models (audio classification model, speaker recognition model, etc.) and stored in Milvus. Milvus automatically generates a unique ID for each vector, then stores the ID and the corresponding audio information (audio ID, audio speaker ID, etc.) in MySQL to complete the library construction. During retrieval, users upload test audio to obtain vector, and then conduct vector similarity search in Milvus.The retrieval result returned by Milvus is vector ID, and the corresponding audio information can be queried in MySQL by ID. + +![Workflow of an audio searching system](./img/audio_searching.png) + +Note:this demo uses the [CN-Celeb](http://openslr.org/82/) dataset of at least 650,000 audio entries and 3000 speakers to build the audio vector library, which is then retrieved using a preset distance calculation. The dataset can also use other, Adjust as needed, e.g. Librispeech, VoxCeleb, UrbanSound, GloVe, MNIST, etc. + +## Usage +### 1. Prepare PaddleSpeech +Audio vector extraction requires PaddleSpeech training model, so please make sure that PaddleSpeech has been installed before running. Specific installation steps: See [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md). + +You can choose one way from easy, meduim and hard to install paddlespeech. + +### 2. Prepare MySQL and Milvus services by docker-compose +The audio similarity search system requires Milvus, MySQL services. We can start these containers with one click through [docker-compose.yaml](./docker-compose.yaml), so please make sure you have [installed Docker Engine](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/) before running. then + +```bash +## Enter the audio_searching directory for the following example +cd ~/PaddleSpeech/demos/audio_searching/ + +## Then start the related services within the container +docker-compose -f docker-compose.yaml up -d +``` + +You will see the that all containers are created: + +```bash +Creating network "quick_deploy_app_net" with driver "bridge" +Creating milvus-minio ... done +Creating milvus-etcd ... done +Creating audio-mysql ... done +Creating milvus-standalone ... done +Creating audio-webclient ... 
done +``` + +And show all containers with `docker ps`, and you can use `docker logs audio-mysql` to get the logs of server container + +```bash +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +b2bcf279e599 milvusdb/milvus:v2.0.1 "/tini -- milvus run…" 22 hours ago Up 22 hours 0.0.0.0:19530->19530/tcp milvus-standalone +d8ef4c84e25c mysql:5.7 "docker-entrypoint.s…" 22 hours ago Up 22 hours 0.0.0.0:3306->3306/tcp, 33060/tcp audio-mysql +8fb501edb4f3 quay.io/coreos/etcd:v3.5.0 "etcd -advertise-cli…" 22 hours ago Up 22 hours 2379-2380/tcp milvus-etcd +ffce340b3790 minio/minio:RELEASE.2020-12-03T00-03-10Z "/usr/bin/docker-ent…" 22 hours ago Up 22 hours (healthy) 9000/tcp milvus-minio +15c84a506754 paddlepaddle/paddlespeech-audio-search-client:2.3 "/bin/bash -c '/usr/…" 22 hours ago Up 22 hours (healthy) 0.0.0.0:8068->80/tcp audio-webclient +``` + +### 3. Start API Server +Then to start the system server, and it provides HTTP backend services. + +- Install the Python packages + + ```bash + pip install -r requirements.txt + ``` +- Set configuration(In the case of local running, you can skip this step.) + + ```bash + ## Method 1: Modify the source file + vim src/config.py + + ## Method 2: Modify the environment variables, as shown in + export MILVUS_HOST=127.0.0.1 + export MYSQL_HOST=127.0.0.1 + ``` + + Here listing some parameters that need to be set, for more information please refer to [config.py](./src/config.py). + + | **Parameter** |**Description** | **Default setting** | + | ---------------- | -----------------------| ------------------- | + | MILVUS_HOST | The IP address of Milvus, you can get it by ifconfig. If running everything on one machine, most likely 127.0.0.1 | 127.0.0.1 + | MILVUS_PORT | Port of Milvus. | 19530 | + | VECTOR_DIMENSION | Dimension of the vectors. | 2048 | + | MYSQL_HOST | The IP address of Mysql. | 127.0.0.1 | + | MYSQL_PORT | Port of Mysql. | 3306 | + | DEFAULT_TABLE | The milvus and mysql default collection name. | audio_table | + +- Run the code + + Then start the server with Fastapi. + + ```bash + export PYTHONPATH=$PYTHONPATH:./src:../../paddleaudio + python src/main.py + ``` + + Then you will see the Application is started: + + ```bash + INFO: Started server process [13352] + 2022-03-26 22:45:30,838 | INFO | server.py | serve | 75 | Started server process [13352] + INFO: Waiting for application startup. + 2022-03-26 22:45:30,839 | INFO | on.py | startup | 45 | Waiting for application startup. + INFO: Application startup complete. + 2022-03-26 22:45:30,839 | INFO | on.py | startup | 59 | Application startup complete. + INFO: Uvicorn running on http://0.0.0.0:8002 (Press CTRL+C to quit) + 2022-03-26 22:45:30,840 | INFO | server.py | _log_started_message | 206 | Uvicorn running on http://0.0.0.0:8002 (Press CTRL+C to quit) + ``` + +### 4. Usage +- Prepare data + ```bash + wget -c https://www.openslr.org/resources/82/cn-celeb_v2.tar.gz && tar -xvf cn-celeb_v2.tar.gz + ``` + **Note**: If you want to build a quick demo, you can use ./src/test_main.py:download_audio_data function, it downloads 20 audio files , Subsequent results show this collection as an example + +- Prepare model(Skip this step if you use the default model.) + ```bash + ## Modify model configuration parameters. 
Currently, only ecapatdnn_voxceleb12 is supported, and multiple types will be supported in the future + vim ./src/encode.py + ``` + +- Scripts test (Recommended) + + The internal process is downloading data, loading the paddlespeech model, extracting embedding, storing library, retrieving and deleting library + ```bash + python ./src/test_main.py + ``` + + Output: + ```bash + Downloading https://paddlespeech.bj.bcebos.com/vector/audio/example_audio.tar.gz ... + ... + Unpacking ./example_audio.tar.gz ... + [2022-03-26 22:50:54,987] [ INFO] - checking the aduio file format...... + [2022-03-26 22:50:54,987] [ INFO] - The sample rate is 16000 + [2022-03-26 22:50:54,987] [ INFO] - The audio file format is right + [2022-03-26 22:50:54,988] [ INFO] - device type: cpu + [2022-03-26 22:50:54,988] [ INFO] - load the pretrained model: ecapatdnn_voxceleb12-16k + [2022-03-26 22:50:54,990] [ INFO] - Downloading sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz from https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz + ... + [2022-03-26 22:51:17,285] [ INFO] - start to dynamic import the model class + [2022-03-26 22:51:17,285] [ INFO] - model name ecapatdnn + [2022-03-26 22:51:23,864] [ INFO] - start to set the model parameters to model + [2022-03-26 22:54:08,115] [ INFO] - create the model instance success + [2022-03-26 22:54:08,116] [ INFO] - Preprocess audio file: /home/zhaoqingen/PaddleSpeech/demos/audio_ + searching/example_audio/knife_hit_iron3.wav + [2022-03-26 22:54:08,116] [ INFO] - load the audio sample points, shape is: (11012,) + [2022-03-26 22:54:08,150] [ INFO] - extract the audio feat, shape is: (80, 69) + [2022-03-26 22:54:08,152] [ INFO] - feats shape: [1, 80, 69] + [2022-03-26 22:54:08,154] [ INFO] - audio extract the feat success + [2022-03-26 22:54:08,155] [ INFO] - start to do backbone network model forward + [2022-03-26 22:54:08,155] [ INFO] - feats shape:[1, 80, 69], lengths shape: [1] + [2022-03-26 22:54:08,433] [ INFO] - embedding size: (192,) + Extracting feature from audio No. 1 , 20 audios in total + [2022-03-26 22:54:08,435] [ INFO] - checking the aduio file format...... + [2022-03-26 22:54:08,435] [ INFO] - The sample rate is 16000 + [2022-03-26 22:54:08,436] [ INFO] - The audio file format is right + [2022-03-26 22:54:08,436] [ INFO] - device type: cpu + [2022-03-26 22:54:08,436] [ INFO] - Model has been initialized + [2022-03-26 22:54:08,436] [ INFO] - Preprocess audio file: /home/zhaoqingen/PaddleSpeech/demos/audio_searching/example_audio/sword_wielding.wav + [2022-03-26 22:54:08,436] [ INFO] - load the audio sample points, shape is: (6391,) + [2022-03-26 22:54:08,452] [ INFO] - extract the audio feat, shape is: (80, 40) + [2022-03-26 22:54:08,454] [ INFO] - feats shape: [1, 80, 40] + [2022-03-26 22:54:08,454] [ INFO] - audio extract the feat success + [2022-03-26 22:54:08,454] [ INFO] - start to do backbone network model forward + [2022-03-26 22:54:08,455] [ INFO] - feats shape:[1, 80, 40], lengths shape: [1] + [2022-03-26 22:54:08,633] [ INFO] - embedding size: (192,) + Extracting feature from audio No. 2 , 20 audios in total + ... + 2022-03-26 22:54:15,892 | INFO | main.py | load_audios | 85 | Successfully loaded data, total count: 20 + 2022-03-26 22:54:15,908 | INFO | main.py | count_audio | 148 | Successfully count the number of data! + [2022-03-26 22:54:15,916] [ INFO] - checking the aduio file format...... 
+ [2022-03-26 22:54:15,916] [ INFO] - The sample rate is 16000 + [2022-03-26 22:54:15,916] [ INFO] - The audio file format is right + [2022-03-26 22:54:15,916] [ INFO] - device type: cpu + [2022-03-26 22:54:15,916] [ INFO] - Model has been initialized + [2022-03-26 22:54:15,916] [ INFO] - Preprocess audio file: /home/zhaoqingen/PaddleSpeech/demos/audio_searching/example_audio/test.wav + [2022-03-26 22:54:15,917] [ INFO] - load the audio sample points, shape is: (8456,) + [2022-03-26 22:54:15,923] [ INFO] - extract the audio feat, shape is: (80, 53) + [2022-03-26 22:54:15,924] [ INFO] - feats shape: [1, 80, 53] + [2022-03-26 22:54:15,924] [ INFO] - audio extract the feat success + [2022-03-26 22:54:15,924] [ INFO] - start to do backbone network model forward + [2022-03-26 22:54:15,924] [ INFO] - feats shape:[1, 80, 53], lengths shape: [1] + [2022-03-26 22:54:16,051] [ INFO] - embedding size: (192,) + ... + 2022-03-26 22:54:16,086 | INFO | main.py | search_local_audio | 132 | search result http://testserver/data?audio_path=./example_audio/test.wav, score 100.0 + 2022-03-26 22:54:16,087 | INFO | main.py | search_local_audio | 132 | search result http://testserver/data?audio_path=./example_audio/knife_chopping.wav, score 29.182177782058716 + 2022-03-26 22:54:16,087 | INFO | main.py | search_local_audio | 132 | search result http://testserver/data?audio_path=./example_audio/knife_cut_into_body.wav, score 22.73637056350708 + ... + 2022-03-26 22:54:16,088 | INFO | main.py | search_local_audio | 136 | Successfully searched similar audio! + 2022-03-26 22:54:17,164 | INFO | main.py | drop_tables | 160 | Successfully drop tables in Milvus and MySQL! + ``` +- GUI test (Optional) + + Navigate to 127.0.0.1:8068 in your browser to access the front-end interface. + + **Note**: If the browser and the service are not on the same machine, then the IP needs to be changed to the IP of the machine where the service is located, and the corresponding API_URL in docker-compose.yaml needs to be changed, and the docker-compose.yaml file needs to be re-executed for the change to take effect. + + - Insert data + + Download the data on the server and decompress it to a file, for example, /home/speech/data/. Then enter /home/speech/data/ in the address bar of the upload page to upload the data. + + ![](./img/insert.png) + + - Search for similar audio + + Select the magnifying glass icon on the left side of the interface. Then, press the "Default Target Audio File" button and upload a .wav sound file from the client you'd like to search. Results will be displayed. + + ![](./img/search.png) + +### 5.Result + + machine configuration: +- OS: CentOS release 7.6 +- kernel:4.17.11-1.el7.elrepo.x86_64 +- CPU:Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz +- memory:132G + +dataset: +- CN-Celeb, train size 650,000, test size 10,000, dimention 192, distance L2 + +recall and elapsed time statistics are shown in the following figure: + + ![](./img/result.png) + + +The retrieval framework based on Milvus takes about 2.9 milliseconds to retrieve on the premise of 90% recall rate, and it takes about 500 milliseconds for feature extraction (testing audio takes about 5 seconds), that is, a single audio test takes about 503 milliseconds in total, which can meet most application scenarios. 
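Beyond the scripted test and the web UI, the API server started in step 3 can also be queried directly over HTTP. The snippet below is a minimal sketch under a few assumptions: it targets the routes defined in `src/main.py` (`/audio/load`, `/audio/search/local`, `/audio/count`), assumes the server is reachable at the default `http://127.0.0.1:8002`, uses the `requests` package (not listed in `requirements.txt`), and the data paths are placeholders.

```python
import requests

SERVER = "http://127.0.0.1:8002"  # src/main.py starts uvicorn on port 8002

# Index every wav file under a directory that exists on the server side.
resp = requests.post(f"{SERVER}/audio/load", json={"File": "/home/speech/data"})
print(resp.json())

# Search a server-local audio file and print the ranked matches.
resp = requests.post(
    f"{SERVER}/audio/search/local",
    params={"query_audio_path": "./example_audio/test.wav"})
for path, (name, score) in resp.json():
    print(f"{name}: {score:.2f}")

# Report how many vectors are currently stored.
print(requests.get(f"{SERVER}/audio/count").json())
```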
+ +### 6.Pretrained Models + +Here is a list of pretrained models released by PaddleSpeech : + +| Model | Sample Rate +| :--- | :---: +| ecapa_tdnn | 16000 diff --git a/demos/audio_searching/README_cn.md b/demos/audio_searching/README_cn.md new file mode 100644 index 000000000..0d0f42a0f --- /dev/null +++ b/demos/audio_searching/README_cn.md @@ -0,0 +1,237 @@ + +(简体中文|[English](./README.md)) + +# 音频相似性检索 +## 介绍 + +随着互联网不断发展,电子邮件、社交媒体照片、直播视频、客服语音等非结构化数据已经变得越来越普遍。如果想要使用计算机来处理这些数据,需要使用 embedding 技术将这些数据转化为向量 vector,然后进行存储、建索引、并查询。 + +但是,当数据量很大,比如上亿条音频要做相似度搜索,就比较困难了。穷举法固然可行,但非常耗时。针对这种场景,该 demo 将介绍如何使用开源向量数据库 Milvus 搭建音频相似度检索系统。 + +音频检索(如演讲、音乐、说话人等检索)实现了在海量音频数据中查询并找出相似声音(或相同说话人)片段。音频相似性检索系统可用于识别相似的音效、最大限度减少知识产权侵权等,还可以快速的检索声纹库、帮助企业控制欺诈和身份盗用等。在音频数据的分类和统计分析中,音频检索也发挥着重要作用。 + +在本 demo 中,你将学会如何构建一个音频检索系统,用来检索相似的声音片段。使用基于 PaddleSpeech 预训练模型(音频分类模型,说话人识别模型等)将上传的音频片段转换为向量数据,并存储在 Milvus 中。Milvus 自动为每个向量生成唯一的 ID,然后将 ID 和 相应的音频信息(音频id,音频的说话人id等等)存储在 MySQL,这样就完成建库的工作。用户在检索时,上传测试音频,得到向量,然后在 Milvus 中进行向量相似度搜索,Milvus 返回的检索结果为向量 ID,通过 ID 在 MySQL 内部查询相应的音频信息即可。 + +![音频检索流程图](./img/audio_searching.png) + +注:该 demo 使用 [CN-Celeb](http://openslr.org/82/) 数据集,包括至少 650000 条音频,3000 个说话人,来建立音频向量库(音频特征,或音频说话人特征),然后通过预设的距离计算方式进行音频(或说话人)检索,这里面数据集也可以使用其他的,根据需要调整,如Librispeech,VoxCeleb,UrbanSound,GloVe,MNIST等。 + +## 使用方法 +### 1. PaddleSpeech 安装 +音频向量的提取需要用到基于 PaddleSpeech 训练的模型,所以请确保在运行之前已经安装了 PaddleSpeech,具体安装步骤,详见[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。 + +你可以从 easy,medium,hard 三种方式中选择一种方式安装。 + +### 2. MySQL 和 Milvus 安装 +音频相似性的检索需要用到 Milvus, MySQL 服务。 我们可以通过 [docker-compose.yaml](./docker-compose.yaml) 一键启动这些容器,所以请确保在运行之前已经安装了 [Docker Engine](https://docs.docker.com/engine/install/) 和 [Docker Compose](https://docs.docker.com/compose/install/)。 即 + +```bash +## 先进入到 audio_searching 目录,如下示例 +cd ~/PaddleSpeech/demos/audio_searching/ + +## 然后启动容器内的相关服务 +docker-compose -f docker-compose.yaml up -d +``` + +你会看到所有的容器都被创建: + +```bash +Creating network "quick_deploy_app_net" with driver "bridge" +Creating milvus-minio ... done +Creating milvus-etcd ... done +Creating audio-mysql ... done +Creating milvus-standalone ... done +Creating audio-webclient ... done +``` + +可以采用'docker ps'来显示所有的容器,还可以使用'docker logs audio-mysql'来获取服务器容器的日志: + +```bash +CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES +b2bcf279e599 milvusdb/milvus:v2.0.1 "/tini -- milvus run…" 22 hours ago Up 22 hours 0.0.0.0:19530->19530/tcp milvus-standalone +d8ef4c84e25c mysql:5.7 "docker-entrypoint.s…" 22 hours ago Up 22 hours 0.0.0.0:3306->3306/tcp, 33060/tcp audio-mysql +8fb501edb4f3 quay.io/coreos/etcd:v3.5.0 "etcd -advertise-cli…" 22 hours ago Up 22 hours 2379-2380/tcp milvus-etcd +ffce340b3790 minio/minio:RELEASE.2020-12-03T00-03-10Z "/usr/bin/docker-ent…" 22 hours ago Up 22 hours (healthy) 9000/tcp milvus-minio +15c84a506754 paddlepaddle/paddlespeech-audio-search-client:2.3 "/bin/bash -c '/usr/…" 22 hours ago Up 22 hours (healthy) 0.0.0.0:8068->80/tcp audio-webclient + +``` + +### 3. 
配置并启动 API 服务 +启动系统服务程序,它会提供基于 HTTP 后端服务。 + +- 安装服务依赖的 python 基础包 + + ```bash + pip install -r requirements.txt + ``` +- 修改配置(本地运行情况下,一般不用修改,可以跳过该步骤) + + ```bash + ## 方法一:修改源码文件 + vim src/config.py + + ## 方法二:修改环境变量,如下所示 + export MILVUS_HOST=127.0.0.1 + export MYSQL_HOST=127.0.0.1 + ``` + + 这里列出了一些需要设置的参数,更多信息请参考 [config.py](./src/config.py) + + | **参数** | **描述** | **默认设置** | + | ---------------- | -------------------- | ------------------- | + | MILVUS_HOST | Milvus 服务的 IP 地址 | 127.0.0.1 | + | MILVUS_PORT | Milvus 服务的端口号 | 19530 | + | VECTOR_DIMENSION | 特征向量的维度 | 192 | + | MYSQL_HOST | Mysql 服务的 IP 地址 | 127.0.0.1 | + | MYSQL_PORT | Mysql 服务的端口号 | 3306 | + | DEFAULT_TABLE | 默认存储的表名 | audio_table | + +- 运行程序 + + 启动用 Fastapi 构建的服务 + + ```bash + export PYTHONPATH=$PYTHONPATH:./src:../../paddleaudio + python src/main.py + ``` + + 然后你会看到应用程序启动: + + ```bash + INFO: Started server process [13352] + 2022-03-26 22:45:30,838 | INFO | server.py | serve | 75 | Started server process [13352] + INFO: Waiting for application startup. + 2022-03-26 22:45:30,839 | INFO | on.py | startup | 45 | Waiting for application startup. + INFO: Application startup complete. + 2022-03-26 22:45:30,839 | INFO | on.py | startup | 59 | Application startup complete. + INFO: Uvicorn running on http://0.0.0.0:8002 (Press CTRL+C to quit) + 2022-03-26 22:45:30,840 | INFO | server.py | _log_started_message | 206 | Uvicorn running on http://0.0.0.0:8002 (Press CTRL+C to quit) + ``` + +### 4. 测试方法 +- 准备数据 + ```bash + wget -c https://www.openslr.org/resources/82/cn-celeb_v2.tar.gz && tar -xvf cn-celeb_v2.tar.gz + ``` + **注**:如果希望快速搭建 demo,可以采用 ./src/test_main.py:download_audio_data 内部的 20 条音频,另外后续结果展示以该集合为例 + +- 准备模型(如果使用默认模型,可以跳过此步骤) + ```bash + ## 修改模型配置参数,目前 model 仅支持 ecapatdnn_voxceleb12,后续将支持多种类型 + vim ./src/encode.py + ``` + + - 脚本测试(推荐) + + ```bash + python ./src/test_main.py + ``` + 注:内部将依次下载数据,加载 paddlespeech 模型,提取 embedding,存储建库,检索,删库 + + 输出: + ```bash + Downloading https://paddlespeech.bj.bcebos.com/vector/audio/example_audio.tar.gz ... + ... + Unpacking ./example_audio.tar.gz ... + [2022-03-26 22:50:54,987] [ INFO] - checking the aduio file format...... + [2022-03-26 22:50:54,987] [ INFO] - The sample rate is 16000 + [2022-03-26 22:50:54,987] [ INFO] - The audio file format is right + [2022-03-26 22:50:54,988] [ INFO] - device type: cpu + [2022-03-26 22:50:54,988] [ INFO] - load the pretrained model: ecapatdnn_voxceleb12-16k + [2022-03-26 22:50:54,990] [ INFO] - Downloading sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz from https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz + ... 
+ [2022-03-26 22:51:17,285] [ INFO] - start to dynamic import the model class + [2022-03-26 22:51:17,285] [ INFO] - model name ecapatdnn + [2022-03-26 22:51:23,864] [ INFO] - start to set the model parameters to model + [2022-03-26 22:54:08,115] [ INFO] - create the model instance success + [2022-03-26 22:54:08,116] [ INFO] - Preprocess audio file: /home/zhaoqingen/PaddleSpeech/demos/audio_ + searching/example_audio/knife_hit_iron3.wav + [2022-03-26 22:54:08,116] [ INFO] - load the audio sample points, shape is: (11012,) + [2022-03-26 22:54:08,150] [ INFO] - extract the audio feat, shape is: (80, 69) + [2022-03-26 22:54:08,152] [ INFO] - feats shape: [1, 80, 69] + [2022-03-26 22:54:08,154] [ INFO] - audio extract the feat success + [2022-03-26 22:54:08,155] [ INFO] - start to do backbone network model forward + [2022-03-26 22:54:08,155] [ INFO] - feats shape:[1, 80, 69], lengths shape: [1] + [2022-03-26 22:54:08,433] [ INFO] - embedding size: (192,) + Extracting feature from audio No. 1 , 20 audios in total + [2022-03-26 22:54:08,435] [ INFO] - checking the aduio file format...... + [2022-03-26 22:54:08,435] [ INFO] - The sample rate is 16000 + [2022-03-26 22:54:08,436] [ INFO] - The audio file format is right + [2022-03-26 22:54:08,436] [ INFO] - device type: cpu + [2022-03-26 22:54:08,436] [ INFO] - Model has been initialized + [2022-03-26 22:54:08,436] [ INFO] - Preprocess audio file: /home/zhaoqingen/PaddleSpeech/demos/audio_searching/example_audio/sword_wielding.wav + [2022-03-26 22:54:08,436] [ INFO] - load the audio sample points, shape is: (6391,) + [2022-03-26 22:54:08,452] [ INFO] - extract the audio feat, shape is: (80, 40) + [2022-03-26 22:54:08,454] [ INFO] - feats shape: [1, 80, 40] + [2022-03-26 22:54:08,454] [ INFO] - audio extract the feat success + [2022-03-26 22:54:08,454] [ INFO] - start to do backbone network model forward + [2022-03-26 22:54:08,455] [ INFO] - feats shape:[1, 80, 40], lengths shape: [1] + [2022-03-26 22:54:08,633] [ INFO] - embedding size: (192,) + Extracting feature from audio No. 2 , 20 audios in total + ... + 2022-03-26 22:54:15,892 | INFO | main.py | load_audios | 85 | Successfully loaded data, total count: 20 + 2022-03-26 22:54:15,908 | INFO | main.py | count_audio | 148 | Successfully count the number of data! + [2022-03-26 22:54:15,916] [ INFO] - checking the aduio file format...... + [2022-03-26 22:54:15,916] [ INFO] - The sample rate is 16000 + [2022-03-26 22:54:15,916] [ INFO] - The audio file format is right + [2022-03-26 22:54:15,916] [ INFO] - device type: cpu + [2022-03-26 22:54:15,916] [ INFO] - Model has been initialized + [2022-03-26 22:54:15,916] [ INFO] - Preprocess audio file: /home/zhaoqingen/PaddleSpeech/demos/audio_searching/example_audio/test.wav + [2022-03-26 22:54:15,917] [ INFO] - load the audio sample points, shape is: (8456,) + [2022-03-26 22:54:15,923] [ INFO] - extract the audio feat, shape is: (80, 53) + [2022-03-26 22:54:15,924] [ INFO] - feats shape: [1, 80, 53] + [2022-03-26 22:54:15,924] [ INFO] - audio extract the feat success + [2022-03-26 22:54:15,924] [ INFO] - start to do backbone network model forward + [2022-03-26 22:54:15,924] [ INFO] - feats shape:[1, 80, 53], lengths shape: [1] + [2022-03-26 22:54:16,051] [ INFO] - embedding size: (192,) + ... 
+ 2022-03-26 22:54:16,086 | INFO | main.py | search_local_audio | 132 | search result http://testserver/data?audio_path=./example_audio/test.wav, score 100.0 + 2022-03-26 22:54:16,087 | INFO | main.py | search_local_audio | 132 | search result http://testserver/data?audio_path=./example_audio/knife_chopping.wav, score 29.182177782058716 + 2022-03-26 22:54:16,087 | INFO | main.py | search_local_audio | 132 | search result http://testserver/data?audio_path=./example_audio/knife_cut_into_body.wav, score 22.73637056350708 + ... + 2022-03-26 22:54:16,088 | INFO | main.py | search_local_audio | 136 | Successfully searched similar audio! + 2022-03-26 22:54:17,164 | INFO | main.py | drop_tables | 160 | Successfully drop tables in Milvus and MySQL! + ``` + + - 前端测试(可选) + + 在浏览器中输入 127.0.0.1:8068 访问前端页面 + + **注**:如果浏览器和服务不在同一台机器上,那么 IP 需要修改成服务所在的机器 IP,并且 docker-compose.yaml 中相应的 API_URL 也要修改,然后重新执行 docker-compose.yaml 文件,使修改生效。 + + - 上传音频 + + 在服务端下载数据并解压到一文件夹,假设为 /home/speech/data/,那么在上传页面地址栏输入 /home/speech/data/ 进行数据上传 + + ![](./img/insert.png) + + - 检索相似音频 + + 选择左上角放大镜,点击 “Default Target Audio File” 按钮,从客户端上传测试音频,接着你将看到检索结果 + + ![](./img/search.png) + +### 5. 结果 + +机器配置: +- 操作系统: CentOS release 7.6 +- 内核:4.17.11-1.el7.elrepo.x86_64 +- 处理器:Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz +- 内存:132G + +数据集: +- CN-Celeb, 训练集 65万, 测试集 1万,向量维度 192,距离计算方式 L2 + +召回和耗时统计如下图: + + ![](./img/result.png) + +基于 Milvus 的检索框架在召回率 90% 的前提下,检索耗时约 2.9 毫秒,加上特征提取(Embedding)耗时约 500 毫秒(测试音频时长约 5 秒),即单条音频测试总共耗时约 503 毫秒,可以满足大多数应用场景。 + +### 6. 预训练模型 + +以下是 PaddleSpeech 提供的预训练模型列表: + +| 模型 | 采样率 +| :--- | :---: +| ecapa_tdnn| 16000 diff --git a/demos/audio_searching/docker-compose.yaml b/demos/audio_searching/docker-compose.yaml new file mode 100644 index 000000000..16ac054d6 --- /dev/null +++ b/demos/audio_searching/docker-compose.yaml @@ -0,0 +1,88 @@ +version: '3.5' + +services: + etcd: + container_name: milvus-etcd + image: quay.io/coreos/etcd:v3.5.0 + networks: + app_net: + environment: + - ETCD_AUTO_COMPACTION_MODE=revision + - ETCD_AUTO_COMPACTION_RETENTION=1000 + - ETCD_QUOTA_BACKEND_BYTES=4294967296 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd + command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd + + minio: + container_name: milvus-minio + image: minio/minio:RELEASE.2020-12-03T00-03-10Z + networks: + app_net: + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data + command: minio server /minio_data + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + + standalone: + container_name: milvus-standalone + image: milvusdb/milvus:v2.0.1 + networks: + app_net: + ipv4_address: 172.16.23.10 + command: ["milvus", "run", "standalone"] + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus + ports: + - "19530:19530" + depends_on: + - "etcd" + - "minio" + + mysql: + container_name: audio-mysql + image: mysql:5.7 + networks: + app_net: + ipv4_address: 172.16.23.11 + environment: + - MYSQL_ROOT_PASSWORD=123456 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/mysql:/var/lib/mysql + ports: + - "3306:3306" + + webclient: + container_name: audio-webclient + image: paddlepaddle/paddlespeech-audio-search-client:2.3 + networks: + app_net: + ipv4_address: 172.16.23.13 + environment: + API_URL: 
'http://127.0.0.1:8002' + ports: + - "8068:80" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost/"] + interval: 30s + timeout: 20s + retries: 3 + +networks: + app_net: + driver: bridge + ipam: + driver: default + config: + - subnet: 172.16.23.0/24 + gateway: 172.16.23.1 diff --git a/demos/audio_searching/img/audio_searching.png b/demos/audio_searching/img/audio_searching.png new file mode 100644 index 000000000..b145dd499 Binary files /dev/null and b/demos/audio_searching/img/audio_searching.png differ diff --git a/demos/audio_searching/img/insert.png b/demos/audio_searching/img/insert.png new file mode 100644 index 000000000..a01015e4e Binary files /dev/null and b/demos/audio_searching/img/insert.png differ diff --git a/demos/audio_searching/img/result.png b/demos/audio_searching/img/result.png new file mode 100644 index 000000000..c4efc0c7f Binary files /dev/null and b/demos/audio_searching/img/result.png differ diff --git a/demos/audio_searching/img/search.png b/demos/audio_searching/img/search.png new file mode 100644 index 000000000..cccc7fb92 Binary files /dev/null and b/demos/audio_searching/img/search.png differ diff --git a/demos/audio_searching/requirements.txt b/demos/audio_searching/requirements.txt new file mode 100644 index 000000000..057c6ab92 --- /dev/null +++ b/demos/audio_searching/requirements.txt @@ -0,0 +1,13 @@ +diskcache==5.2.1 +dtaidistance==2.3.1 +fastapi +librosa==0.8.0 +numpy==1.21.0 +pydantic +pymilvus==2.0.1 +pymysql +python-multipart +soundfile==0.10.3.post1 +starlette +typing +uvicorn \ No newline at end of file diff --git a/demos/audio_searching/src/config.py b/demos/audio_searching/src/config.py new file mode 100644 index 000000000..3d6d3d43b --- /dev/null +++ b/demos/audio_searching/src/config.py @@ -0,0 +1,36 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
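# Every setting below is read from an environment variable with a local-development
# default, so the Milvus/MySQL endpoints, vector dimension, table name and upload
# path can be overridden per deployment without editing this file.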
+import os + +############### Milvus Configuration ############### +MILVUS_HOST = os.getenv("MILVUS_HOST", "127.0.0.1") +MILVUS_PORT = int(os.getenv("MILVUS_PORT", "19530")) +VECTOR_DIMENSION = int(os.getenv("VECTOR_DIMENSION", "192")) +INDEX_FILE_SIZE = int(os.getenv("INDEX_FILE_SIZE", "1024")) +METRIC_TYPE = os.getenv("METRIC_TYPE", "L2") +DEFAULT_TABLE = os.getenv("DEFAULT_TABLE", "audio_table") +TOP_K = int(os.getenv("TOP_K", "10")) + +############### MySQL Configuration ############### +MYSQL_HOST = os.getenv("MYSQL_HOST", "127.0.0.1") +MYSQL_PORT = int(os.getenv("MYSQL_PORT", "3306")) +MYSQL_USER = os.getenv("MYSQL_USER", "root") +MYSQL_PWD = os.getenv("MYSQL_PWD", "123456") +MYSQL_DB = os.getenv("MYSQL_DB", "mysql") + +############### Data Path ############### +UPLOAD_PATH = os.getenv("UPLOAD_PATH", "tmp/audio-data") + +############### Number of Log Files ############### +LOGS_NUM = int(os.getenv("logs_num", "0")) diff --git a/demos/audio_searching/src/encode.py b/demos/audio_searching/src/encode.py new file mode 100644 index 000000000..f67184c29 --- /dev/null +++ b/demos/audio_searching/src/encode.py @@ -0,0 +1,34 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np + +from logs import LOGGER +from paddlespeech.cli import VectorExecutor + +vector_executor = VectorExecutor() + + +def get_audio_embedding(path): + """ + Use vpr_inference to generate embedding of audio + """ + try: + embedding = vector_executor( + audio_file=path, model='ecapatdnn_voxceleb12') + embedding = embedding / np.linalg.norm(embedding) + embedding = embedding.tolist() + return embedding + except Exception as e: + LOGGER.error(f"Error with embedding:{e}") + return None diff --git a/demos/audio_searching/src/logs.py b/demos/audio_searching/src/logs.py new file mode 100644 index 000000000..465eb682a --- /dev/null +++ b/demos/audio_searching/src/logs.py @@ -0,0 +1,163 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
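# Logging setup: write_log() attaches a stdout StreamHandler (INFO) plus a
# MultiprocessHandler, a logging.FileHandler subclass that writes date-stamped
# files under ./logs and prunes old backups according to LOGS_NUM.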
+import datetime +import logging +import os +import re +import sys + +from config import LOGS_NUM + + +class MultiprocessHandler(logging.FileHandler): + """ + A handler class which writes formatted logging records to disk files + """ + + def __init__(self, + filename, + when='D', + backupCount=0, + encoding=None, + delay=False): + """ + Open the specified file and use it as the stream for logging + """ + self.prefix = filename + self.backupCount = backupCount + self.when = when.upper() + self.extMath = r"^\d{4}-\d{2}-\d{2}" + + self.when_dict = { + 'S': "%Y-%m-%d-%H-%M-%S", + 'M': "%Y-%m-%d-%H-%M", + 'H': "%Y-%m-%d-%H", + 'D': "%Y-%m-%d" + } + + self.suffix = self.when_dict.get(when) + if not self.suffix: + print('The specified date interval unit is invalid: ', self.when) + sys.exit(1) + + self.filefmt = os.path.join('.', "logs", + f"{self.prefix}-{self.suffix}.log") + + self.filePath = datetime.datetime.now().strftime(self.filefmt) + + _dir = os.path.dirname(self.filefmt) + try: + if not os.path.exists(_dir): + os.makedirs(_dir) + except Exception as e: + print('Failed to create log file: ', e) + print("log_path:" + self.filePath) + sys.exit(1) + + logging.FileHandler.__init__(self, self.filePath, 'a+', encoding, delay) + + def should_change_file_to_write(self): + """ + To write the file + """ + _filePath = datetime.datetime.now().strftime(self.filefmt) + if _filePath != self.filePath: + self.filePath = _filePath + return True + return False + + def do_change_file(self): + """ + To change file states + """ + self.baseFilename = os.path.abspath(self.filePath) + if self.stream: + self.stream.close() + self.stream = None + + if not self.delay: + self.stream = self._open() + if self.backupCount > 0: + for s in self.get_files_to_delete(): + os.remove(s) + + def get_files_to_delete(self): + """ + To delete backup files + """ + dir_name, _ = os.path.split(self.baseFilename) + file_names = os.listdir(dir_name) + result = [] + prefix = self.prefix + '-' + for file_name in file_names: + if file_name[:len(prefix)] == prefix: + suffix = file_name[len(prefix):-4] + if re.compile(self.extMath).match(suffix): + result.append(os.path.join(dir_name, file_name)) + result.sort() + + if len(result) < self.backupCount: + result = [] + else: + result = result[:len(result) - self.backupCount] + return result + + def emit(self, record): + """ + Emit a record + """ + try: + if self.should_change_file_to_write(): + self.do_change_file() + logging.FileHandler.emit(self, record) + except (KeyboardInterrupt, SystemExit): + raise + except Exception as e: + self.handleError(record) + + +def write_log(): + """ + Init a logger + """ + logger = logging.getLogger() + logger.setLevel(logging.DEBUG) + # formatter = '%(asctime)s | %(levelname)s | %(filename)s | %(funcName)s | %(module)s | %(lineno)s | %(message)s' + fmt = logging.Formatter( + '%(asctime)s | %(levelname)s | %(filename)s | %(funcName)s | %(lineno)s | %(message)s' + ) + + stream_handler = logging.StreamHandler(sys.stdout) + stream_handler.setLevel(logging.INFO) + stream_handler.setFormatter(fmt) + + log_name = "audio-searching" + file_handler = MultiprocessHandler(log_name, when='D', backupCount=LOGS_NUM) + file_handler.setLevel(logging.DEBUG) + file_handler.setFormatter(fmt) + file_handler.do_change_file() + + logger.addHandler(stream_handler) + logger.addHandler(file_handler) + + return logger + + +LOGGER = write_log() + +if __name__ == "__main__": + message = 'test writing logs' + LOGGER.info(message) + LOGGER.debug(message) + LOGGER.error(message) diff --git 
a/demos/audio_searching/src/main.py b/demos/audio_searching/src/main.py new file mode 100644 index 000000000..db091a39d --- /dev/null +++ b/demos/audio_searching/src/main.py @@ -0,0 +1,168 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +from typing import Optional + +import uvicorn +from config import UPLOAD_PATH +from diskcache import Cache +from fastapi import FastAPI +from fastapi import File +from fastapi import UploadFile +from logs import LOGGER +from milvus_helpers import MilvusHelper +from mysql_helpers import MySQLHelper +from operations.count import do_count +from operations.drop import do_drop +from operations.load import do_load +from operations.search import do_search +from pydantic import BaseModel +from starlette.middleware.cors import CORSMiddleware +from starlette.requests import Request +from starlette.responses import FileResponse + +app = FastAPI() +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"]) + +MODEL = None +MILVUS_CLI = MilvusHelper() +MYSQL_CLI = MySQLHelper() + +# Mkdir 'tmp/audio-data' +if not os.path.exists(UPLOAD_PATH): + os.makedirs(UPLOAD_PATH) + LOGGER.info(f"Mkdir the path: {UPLOAD_PATH}") + + +@app.get('/data') +def audio_path(audio_path): + # Get the audio file + try: + LOGGER.info(f"Successfully load audio: {audio_path}") + return FileResponse(audio_path) + except Exception as e: + LOGGER.error(f"upload audio error: {e}") + return {'status': False, 'msg': e}, 400 + + +@app.get('/progress') +def get_progress(): + # Get the progress of dealing with data + try: + cache = Cache('./tmp') + return f"current: {cache['current']}, total: {cache['total']}" + except Exception as e: + LOGGER.error(f"Upload data error: {e}") + return {'status': False, 'msg': e}, 400 + + +class Item(BaseModel): + Table: Optional[str] = None + File: str + + +@app.post('/audio/load') +async def load_audios(item: Item): + # Insert all the audio files under the file path to Milvus/MySQL + try: + total_num = do_load(item.Table, item.File, MILVUS_CLI, MYSQL_CLI) + LOGGER.info(f"Successfully loaded data, total count: {total_num}") + return {'status': True, 'msg': "Successfully loaded data!"} + except Exception as e: + LOGGER.error(e) + return {'status': False, 'msg': e}, 400 + + +@app.post('/audio/search') +async def search_audio(request: Request, + table_name: str=None, + audio: UploadFile=File(...)): + # Search the uploaded audio in Milvus/MySQL + try: + # Save the upload data to server. 
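+        # The raw upload is written under UPLOAD_PATH, an embedding is extracted from it,
+        # and the embedding is matched against the Milvus collection; the returned ids are
+        # then mapped back to audio paths stored in MySQL (see do_search in operations/search.py).
+        # Illustrative request against the port used by uvicorn.run below (8002):
+        #   curl -F "audio=@./example_audio/test.wav" http://127.0.0.1:8002/audio/search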
+ content = await audio.read() + query_audio_path = os.path.join(UPLOAD_PATH, audio.filename) + with open(query_audio_path, "wb+") as f: + f.write(content) + host = request.headers['host'] + _, paths, distances = do_search(host, table_name, query_audio_path, + MILVUS_CLI, MYSQL_CLI) + names = [] + for path, score in zip(paths, distances): + names.append(os.path.basename(path)) + LOGGER.info(f"search result {path}, score {score}") + res = dict(zip(paths, zip(names, distances))) + # Sort results by distance metric, closest distances first + res = sorted(res.items(), key=lambda item: item[1][1], reverse=True) + LOGGER.info("Successfully searched similar audio!") + return res + except Exception as e: + LOGGER.error(e) + return {'status': False, 'msg': e}, 400 + + +@app.post('/audio/search/local') +async def search_local_audio(request: Request, + query_audio_path: str, + table_name: str=None): + # Search the uploaded audio in Milvus/MySQL + try: + host = request.headers['host'] + _, paths, distances = do_search(host, table_name, query_audio_path, + MILVUS_CLI, MYSQL_CLI) + names = [] + for path, score in zip(paths, distances): + names.append(os.path.basename(path)) + LOGGER.info(f"search result {path}, score {score}") + res = dict(zip(paths, zip(names, distances))) + # Sort results by distance metric, closest distances first + res = sorted(res.items(), key=lambda item: item[1][1], reverse=True) + LOGGER.info("Successfully searched similar audio!") + return res + except Exception as e: + LOGGER.error(e) + return {'status': False, 'msg': e}, 400 + + +@app.get('/audio/count') +async def count_audio(table_name: str=None): + # Returns the total number of vectors in the system + try: + num = do_count(table_name, MILVUS_CLI) + LOGGER.info("Successfully count the number of data!") + return num + except Exception as e: + LOGGER.error(e) + return {'status': False, 'msg': e}, 400 + + +@app.post('/audio/drop') +async def drop_tables(table_name: str=None): + # Delete the collection of Milvus and MySQL + try: + status = do_drop(table_name, MILVUS_CLI, MYSQL_CLI) + LOGGER.info("Successfully drop tables in Milvus and MySQL!") + return status + except Exception as e: + LOGGER.error(e) + return {'status': False, 'msg': e}, 400 + + +if __name__ == '__main__': + uvicorn.run(app=app, host='0.0.0.0', port=8002) diff --git a/demos/audio_searching/src/milvus_helpers.py b/demos/audio_searching/src/milvus_helpers.py new file mode 100644 index 000000000..1699e892e --- /dev/null +++ b/demos/audio_searching/src/milvus_helpers.py @@ -0,0 +1,185 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
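+# Connection parameters (MILVUS_HOST/MILVUS_PORT), the embedding dimension and the
+# metric type are read from config.py. Note that every helper below logs failures and
+# calls sys.exit(1) instead of raising, so a Milvus error terminates the server process.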
+import sys + +from config import METRIC_TYPE +from config import MILVUS_HOST +from config import MILVUS_PORT +from config import VECTOR_DIMENSION +from logs import LOGGER +from pymilvus import Collection +from pymilvus import CollectionSchema +from pymilvus import connections +from pymilvus import DataType +from pymilvus import FieldSchema +from pymilvus import utility + + +class MilvusHelper: + """ + the basic operations of PyMilvus + + # This example shows how to: + # 1. connect to Milvus server + # 2. create a collection + # 3. insert entities + # 4. create index + # 5. search + # 6. delete a collection + + """ + + def __init__(self): + try: + self.collection = None + connections.connect(host=MILVUS_HOST, port=MILVUS_PORT) + LOGGER.debug( + f"Successfully connect to Milvus with IP:{MILVUS_HOST} and PORT:{MILVUS_PORT}" + ) + except Exception as e: + LOGGER.error(f"Failed to connect Milvus: {e}") + sys.exit(1) + + def set_collection(self, collection_name): + try: + if self.has_collection(collection_name): + self.collection = Collection(name=collection_name) + else: + raise Exception( + f"There is no collection named:{collection_name}") + except Exception as e: + LOGGER.error(f"Failed to set collection in Milvus: {e}") + sys.exit(1) + + def has_collection(self, collection_name): + # Return if Milvus has the collection + try: + return utility.has_collection(collection_name) + except Exception as e: + LOGGER.error(f"Failed to check state of collection in Milvus: {e}") + sys.exit(1) + + def create_collection(self, collection_name): + # Create milvus collection if not exists + try: + if not self.has_collection(collection_name): + field1 = FieldSchema( + name="id", + dtype=DataType.INT64, + descrition="int64", + is_primary=True, + auto_id=True) + field2 = FieldSchema( + name="embedding", + dtype=DataType.FLOAT_VECTOR, + descrition="speaker embeddings", + dim=VECTOR_DIMENSION, + is_primary=False) + schema = CollectionSchema( + fields=[field1, field2], description="embeddings info") + self.collection = Collection( + name=collection_name, schema=schema) + LOGGER.debug(f"Create Milvus collection: {collection_name}") + else: + self.set_collection(collection_name) + return "OK" + except Exception as e: + LOGGER.error(f"Failed to create collection in Milvus: {e}") + sys.exit(1) + + def insert(self, collection_name, vectors): + # Batch insert vectors to milvus collection + try: + self.create_collection(collection_name) + data = [vectors] + self.set_collection(collection_name) + mr = self.collection.insert(data) + ids = mr.primary_keys + self.collection.load() + LOGGER.debug( + f"Insert vectors to Milvus in collection: {collection_name} with {len(vectors)} rows" + ) + return ids + except Exception as e: + LOGGER.error(f"Failed to insert data to Milvus: {e}") + sys.exit(1) + + def create_index(self, collection_name): + # Create IVF_FLAT index on milvus collection + try: + self.set_collection(collection_name) + default_index = { + "index_type": "IVF_SQ8", + "metric_type": METRIC_TYPE, + "params": { + "nlist": 16384 + } + } + status = self.collection.create_index( + field_name="embedding", index_params=default_index) + if not status.code: + LOGGER.debug( + f"Successfully create index in collection:{collection_name} with param:{default_index}" + ) + return status + else: + raise Exception(status.message) + except Exception as e: + LOGGER.error(f"Failed to create index: {e}") + sys.exit(1) + + def delete_collection(self, collection_name): + # Delete Milvus collection + try: + 
self.set_collection(collection_name) + self.collection.drop() + LOGGER.debug("Successfully drop collection!") + return "ok" + except Exception as e: + LOGGER.error(f"Failed to drop collection: {e}") + sys.exit(1) + + def search_vectors(self, collection_name, vectors, top_k): + # Search vector in milvus collection + try: + self.set_collection(collection_name) + search_params = { + "metric_type": METRIC_TYPE, + "params": { + "nprobe": 16 + } + } + res = self.collection.search( + vectors, + anns_field="embedding", + param=search_params, + limit=top_k) + LOGGER.debug(f"Successfully search in collection: {res}") + return res + except Exception as e: + LOGGER.error(f"Failed to search vectors in Milvus: {e}") + sys.exit(1) + + def count(self, collection_name): + # Get the number of milvus collection + try: + self.set_collection(collection_name) + num = self.collection.num_entities + LOGGER.debug( + f"Successfully get the num:{num} of the collection:{collection_name}" + ) + return num + except Exception as e: + LOGGER.error(f"Failed to count vectors in Milvus: {e}") + sys.exit(1) diff --git a/demos/audio_searching/src/mysql_helpers.py b/demos/audio_searching/src/mysql_helpers.py new file mode 100644 index 000000000..303838399 --- /dev/null +++ b/demos/audio_searching/src/mysql_helpers.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys + +import pymysql +from config import MYSQL_DB +from config import MYSQL_HOST +from config import MYSQL_PORT +from config import MYSQL_PWD +from config import MYSQL_USER +from logs import LOGGER + + +class MySQLHelper(): + """ + the basic operations of PyMySQL + + # This example shows how to: + # 1. connect to MySQL server + # 2. create a table + # 3. insert data to table + # 4. search by milvus ids + # 5. 
delete table + """ + + def __init__(self): + self.conn = pymysql.connect( + host=MYSQL_HOST, + user=MYSQL_USER, + port=MYSQL_PORT, + password=MYSQL_PWD, + database=MYSQL_DB, + local_infile=True) + self.cursor = self.conn.cursor() + + def test_connection(self): + try: + self.conn.ping() + except Exception: + self.conn = pymysql.connect( + host=MYSQL_HOST, + user=MYSQL_USER, + port=MYSQL_PORT, + password=MYSQL_PWD, + database=MYSQL_DB, + local_infile=True) + self.cursor = self.conn.cursor() + + def create_mysql_table(self, table_name): + # Create mysql table if not exists + self.test_connection() + sql = "create table if not exists " + table_name + "(milvus_id TEXT, audio_path TEXT);" + try: + self.cursor.execute(sql) + LOGGER.debug(f"MYSQL create table: {table_name} with sql: {sql}") + except Exception as e: + LOGGER.error(f"MYSQL ERROR: {e} with sql: {sql}") + sys.exit(1) + + def load_data_to_mysql(self, table_name, data): + # Batch insert (Milvus_ids, img_path) to mysql + self.test_connection() + sql = "insert into " + table_name + " (milvus_id,audio_path) values (%s,%s);" + try: + self.cursor.executemany(sql, data) + self.conn.commit() + LOGGER.debug( + f"MYSQL loads data to table: {table_name} successfully") + except Exception as e: + LOGGER.error(f"MYSQL ERROR: {e} with sql: {sql}") + sys.exit(1) + + def search_by_milvus_ids(self, ids, table_name): + # Get the img_path according to the milvus ids + self.test_connection() + str_ids = str(ids).replace('[', '').replace(']', '') + sql = "select audio_path from " + table_name + " where milvus_id in (" + str_ids + ") order by field (milvus_id," + str_ids + ");" + try: + self.cursor.execute(sql) + results = self.cursor.fetchall() + results = [res[0] for res in results] + LOGGER.debug("MYSQL search by milvus id.") + return results + except Exception as e: + LOGGER.error(f"MYSQL ERROR: {e} with sql: {sql}") + sys.exit(1) + + def delete_table(self, table_name): + # Delete mysql table if exists + self.test_connection() + sql = "drop table if exists " + table_name + ";" + try: + self.cursor.execute(sql) + LOGGER.debug(f"MYSQL delete table:{table_name}") + except Exception as e: + LOGGER.error(f"MYSQL ERROR: {e} with sql: {sql}") + sys.exit(1) + + def delete_all_data(self, table_name): + # Delete all the data in mysql table + self.test_connection() + sql = 'delete from ' + table_name + ';' + try: + self.cursor.execute(sql) + self.conn.commit() + LOGGER.debug(f"MYSQL delete all data in table:{table_name}") + except Exception as e: + LOGGER.error(f"MYSQL ERROR: {e} with sql: {sql}") + sys.exit(1) + + def count_table(self, table_name): + # Get the number of mysql table + self.test_connection() + sql = "select count(milvus_id) from " + table_name + ";" + try: + self.cursor.execute(sql) + results = self.cursor.fetchall() + LOGGER.debug(f"MYSQL count table:{table_name}") + return results[0][0] + except Exception as e: + LOGGER.error(f"MYSQL ERROR: {e} with sql: {sql}") + sys.exit(1) diff --git a/demos/audio_searching/src/operations/__init__.py b/demos/audio_searching/src/operations/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/demos/audio_searching/src/operations/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/demos/audio_searching/src/operations/count.py b/demos/audio_searching/src/operations/count.py new file mode 100644 index 000000000..9a1f42082 --- /dev/null +++ b/demos/audio_searching/src/operations/count.py @@ -0,0 +1,33 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys + +from config import DEFAULT_TABLE +from logs import LOGGER + + +def do_count(table_name, milvus_cli): + """ + Returns the total number of vectors in the system + """ + if not table_name: + table_name = DEFAULT_TABLE + try: + if not milvus_cli.has_collection(table_name): + return None + num = milvus_cli.count(table_name) + return num + except Exception as e: + LOGGER.error(f"Error attempting to count table {e}") + sys.exit(1) diff --git a/demos/audio_searching/src/operations/drop.py b/demos/audio_searching/src/operations/drop.py new file mode 100644 index 000000000..f8278ddd0 --- /dev/null +++ b/demos/audio_searching/src/operations/drop.py @@ -0,0 +1,34 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys + +from config import DEFAULT_TABLE +from logs import LOGGER + + +def do_drop(table_name, milvus_cli, mysql_cli): + """ + Delete the collection of Milvus and MySQL + """ + if not table_name: + table_name = DEFAULT_TABLE + try: + if not milvus_cli.has_collection(table_name): + return "Collection is not exist" + status = milvus_cli.delete_collection(table_name) + mysql_cli.delete_table(table_name) + return status + except Exception as e: + LOGGER.error(f"Error attempting to drop table: {e}") + sys.exit(1) diff --git a/demos/audio_searching/src/operations/load.py b/demos/audio_searching/src/operations/load.py new file mode 100644 index 000000000..80b6375fa --- /dev/null +++ b/demos/audio_searching/src/operations/load.py @@ -0,0 +1,84 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import sys + +from config import DEFAULT_TABLE +from diskcache import Cache +from encode import get_audio_embedding +from logs import LOGGER + + +def get_audios(path): + """ + List all wav and aif files recursively under the path folder. + """ + supported_formats = [".wav", ".mp3", ".ogg", ".flac", ".m4a"] + return [ + item for sublist in [[os.path.join(dir, file) for file in files] + for dir, _, files in list(os.walk(path))] + for item in sublist if os.path.splitext(item)[1] in supported_formats + ] + + +def extract_features(audio_dir): + """ + Get the vector of audio + """ + try: + cache = Cache('./tmp') + feats = [] + names = [] + audio_list = get_audios(audio_dir) + total = len(audio_list) + cache['total'] = total + for i, audio_path in enumerate(audio_list): + norm_feat = get_audio_embedding(audio_path) + if norm_feat is None: + continue + feats.append(norm_feat) + names.append(audio_path.encode()) + cache['current'] = i + 1 + print( + f"Extracting feature from audio No. {i + 1} , {total} audios in total" + ) + return feats, names + except Exception as e: + LOGGER.error(f"Error with extracting feature from audio {e}") + sys.exit(1) + + +def format_data(ids, names): + """ + Combine the id of the vector and the name of the audio into a list + """ + data = [] + for i in range(len(ids)): + value = (str(ids[i]), names[i]) + data.append(value) + return data + + +def do_load(table_name, audio_dir, milvus_cli, mysql_cli): + """ + Import vectors to Milvus and data to Mysql respectively + """ + if not table_name: + table_name = DEFAULT_TABLE + vectors, names = extract_features(audio_dir) + ids = milvus_cli.insert(table_name, vectors) + milvus_cli.create_index(table_name) + mysql_cli.create_mysql_table(table_name) + mysql_cli.load_data_to_mysql(table_name, format_data(ids, names)) + return len(ids) diff --git a/demos/audio_searching/src/operations/search.py b/demos/audio_searching/src/operations/search.py new file mode 100644 index 000000000..9cf48abf9 --- /dev/null +++ b/demos/audio_searching/src/operations/search.py @@ -0,0 +1,41 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
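+# Search flow: embed the query audio, retrieve the TOP_K nearest vectors from Milvus,
+# then map the returned ids to the audio paths stored in MySQL. The raw Milvus distance
+# is converted into a percentage-style score via (1 - distance) * 100 before returning.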
+import sys + +from config import DEFAULT_TABLE +from config import TOP_K +from encode import get_audio_embedding +from logs import LOGGER + + +def do_search(host, table_name, audio_path, milvus_cli, mysql_cli): + """ + Search the uploaded audio in Milvus/MySQL + """ + try: + if not table_name: + table_name = DEFAULT_TABLE + feat = get_audio_embedding(audio_path) + vectors = milvus_cli.search_vectors(table_name, [feat], TOP_K) + vids = [str(x.id) for x in vectors[0]] + paths = mysql_cli.search_by_milvus_ids(vids, table_name) + distances = [x.distance for x in vectors[0]] + for i in range(len(paths)): + tmp = "http://" + str(host) + "/data?audio_path=" + str(paths[i]) + paths[i] = tmp + distances[i] = (1 - distances[i]) * 100 + return vids, paths, distances + except Exception as e: + LOGGER.error(f"Error with search: {e}") + sys.exit(1) diff --git a/demos/audio_searching/src/test_main.py b/demos/audio_searching/src/test_main.py new file mode 100644 index 000000000..32030bae7 --- /dev/null +++ b/demos/audio_searching/src/test_main.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from fastapi.testclient import TestClient +from main import app + +from utils.utility import download +from utils.utility import unpack + +client = TestClient(app) + + +def download_audio_data(): + """ + download audio data + """ + url = "https://paddlespeech.bj.bcebos.com/vector/audio/example_audio.tar.gz" + md5sum = "52ac69316c1aa1fdef84da7dd2c67b39" + target_dir = "./" + filepath = download(url, md5sum, target_dir) + unpack(filepath, target_dir, True) + + +def test_drop(): + """ + Delete the collection of Milvus and MySQL + """ + response = client.post("/audio/drop") + assert response.status_code == 200 + + +def test_load(): + """ + Insert all the audio files under the file path to Milvus/MySQL + """ + response = client.post("/audio/load", json={"File": "./example_audio"}) + assert response.status_code == 200 + assert response.json() == { + 'status': True, + 'msg': "Successfully loaded data!" 
+    }
+
+
+def test_progress():
+    """
+    Get the progress of dealing with data
+    """
+    response = client.get("/progress")
+    assert response.status_code == 200
+    assert response.json() == "current: 20, total: 20"
+
+
+def test_count():
+    """
+    Returns the total number of vectors in the system
+    """
+    response = client.get("audio/count")
+    assert response.status_code == 200
+    assert response.json() == 20
+
+
+def test_search():
+    """
+    Search the uploaded audio in Milvus/MySQL
+    """
+    response = client.post(
+        "/audio/search/local?query_audio_path=.%2Fexample_audio%2Ftest.wav")
+    assert response.status_code == 200
+    assert len(response.json()) == 10
+
+
+def test_data():
+    """
+    Get the audio file
+    """
+    response = client.get("/data?audio_path=.%2Fexample_audio%2Ftest.wav")
+    assert response.status_code == 200
+
+
+if __name__ == "__main__":
+    download_audio_data()
+    test_load()
+    test_count()
+    test_search()
+    test_drop()
diff --git a/demos/speaker_verification/README.md b/demos/speaker_verification/README.md
new file mode 100644
index 000000000..8739d402d
--- /dev/null
+++ b/demos/speaker_verification/README.md
@@ -0,0 +1,158 @@
+([简体中文](./README_cn.md)|English)
+# Speaker Verification
+
+## Introduction
+
+Speaker verification refers to the task of extracting a speaker embedding from an audio file, which can then be used to decide whether two utterances come from the same speaker.
+
+This demo extracts the speaker embedding of a specific audio file. It can be done with a single command or a few lines of Python using `PaddleSpeech`.
+
+## Usage
+### 1. Installation
+See [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).
+
+You can choose one way from easy, medium and hard to install paddlespeech.
+
+### 2. Prepare Input File
+The input of this demo should be a WAV file (`.wav`), and the sample rate must be the same as the model's.
+
+Here is a sample file for this demo that can be downloaded:
+```bash
+wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
+```
+
+### 3. Usage
+- Command Line (Recommended)
+  ```bash
+  paddlespeech vector --task spk --input 85236145389.wav
+
+  echo -e "demo1 85236145389.wav" > vec.job
+  paddlespeech vector --task spk --input vec.job
+
+  echo -e "demo2 85236145389.wav \n demo3 85236145389.wav" | paddlespeech vector --task spk
+  ```
+
+  Usage:
+  ```bash
+  paddlespeech vector --help
+  ```
+  Arguments:
+  - `input` (required): Audio file to extract the speaker embedding from.
+  - `model`: Model type of vector task. Default: `ecapatdnn_voxceleb12`.
+  - `sample_rate`: Sample rate of the model. Default: `16000`.
+  - `config`: Config of vector task. Use pretrained model when it is None. Default: `None`.
+  - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`.
+  - `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment.
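+
+  Note: the command prints a raw speaker embedding (see the output below). To actually verify whether two recordings come from the same speaker, the usual next step is to compare their embeddings with cosine similarity. The following is a minimal, illustrative sketch of that scoring step; it is not part of the CLI, it reuses the sample file for both utterances, and the 0.6 decision threshold is only a placeholder:
+  ```python
+  import numpy as np
+  import paddle
+  from paddlespeech.cli import VectorExecutor
+
+  vector_executor = VectorExecutor()
+
+  def embed(path):
+      # Same arguments as in the Python API example below.
+      return vector_executor(
+          model='ecapatdnn_voxceleb12',
+          sample_rate=16000,
+          config=None,
+          ckpt_path=None,
+          audio_file=path,
+          force_yes=False,
+          device=paddle.get_device())
+
+  emb_enroll = embed('./85236145389.wav')  # enrollment utterance
+  emb_test = embed('./85236145389.wav')    # test utterance (same file here, so the score is ~1.0)
+
+  # Cosine similarity between the two embeddings is the verification score.
+  score = float(
+      np.dot(emb_enroll, emb_test) /
+      (np.linalg.norm(emb_enroll) * np.linalg.norm(emb_test)))
+  print(score, 'same speaker' if score > 0.6 else 'different speaker')
+  ```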
+ + Output: + + ```bash + demo [ -5.749211 9.505463 -8.200284 -5.2075014 5.3940268 + -3.04878 1.611095 10.127234 -10.534177 -15.821609 + 1.2032688 -0.35080156 1.2629458 -12.643498 -2.5758228 + -11.343508 2.3385992 -8.719341 14.213509 15.404744 + -0.39327756 6.338786 2.688887 8.7104025 17.469526 + -8.77959 7.0576906 4.648855 -1.3089896 -23.294737 + 8.013747 13.891729 -9.926753 5.655307 -5.9422326 + -22.842539 0.6293588 -18.46266 -10.811862 9.8192625 + 3.0070958 3.8072643 -2.3861165 3.0821571 -14.739942 + 1.7594414 -0.6485091 4.485623 2.0207152 7.264915 + -6.40137 23.63524 2.9711294 -22.708025 9.93719 + 20.354511 -10.324688 -0.700492 -8.783211 -5.27593 + 15.999649 3.3004563 12.747926 15.429879 4.7849145 + 5.6699696 -2.3826702 10.605882 3.9112158 3.1500628 + 15.859915 -2.1832209 -23.908653 -6.4799504 -4.5365124 + -9.224193 14.568347 -10.568833 4.982321 -4.342062 + 0.0914714 12.645902 -5.74285 -3.2141201 -2.7173362 + -6.680575 0.4757669 -5.035051 -6.7964664 16.865469 + -11.54324 7.681869 0.44475392 9.708182 -8.932846 + 0.4123232 -4.361452 1.3948607 9.511665 0.11667654 + 2.9079323 6.049952 9.275183 -18.078873 6.2983274 + -0.7500531 -2.725033 -7.6027865 3.3404543 2.990815 + 4.010979 11.000591 -2.8873312 7.1352735 -16.79663 + 18.495346 -14.293832 7.89578 2.2714825 22.976387 + -4.875734 -3.0836344 -2.9999814 13.751918 6.448228 + -11.924197 2.171869 2.0423572 -6.173772 10.778437 + 25.77281 -4.9495463 14.57806 0.3044315 2.6132357 + -7.591999 -2.076944 9.025118 1.7834753 -3.1799617 + -4.9401326 23.465864 5.1685796 -9.018578 9.037825 + -4.4150195 6.859591 -12.274467 -0.88911164 5.186309 + -3.9988663 -13.638606 -9.925445 -0.06329413 -3.6709652 + -12.397416 -12.719869 -1.395601 2.1150916 5.7381287 + -4.4691963 -3.82819 -0.84233856 -1.1604277 -13.490127 + 8.731719 -20.778936 -11.495662 5.8033476 -4.752041 + 10.833007 -6.717991 4.504732 13.4244375 1.1306485 + 7.3435574 1.400918 14.704036 -9.501399 7.2315617 + -6.417456 1.3333273 11.872697 -0.30664724 8.8845 + 6.5569253 4.7948146 0.03662816 -8.704245 6.224871 + -3.2701402 -11.508579 ] + ``` + +- Python API + ```python + import paddle + from paddlespeech.cli import VectorExecutor + + vector_executor = VectorExecutor() + audio_emb = vector_executor( + model='ecapatdnn_voxceleb12', + sample_rate=16000, + config=None, + ckpt_path=None, + audio_file='./85236145389.wav', + force_yes=False, + device=paddle.get_device()) + print('Audio embedding Result: \n{}'.format(audio_emb)) + ``` + + Output: + ```bash + # Vector Result: + [ -5.749211 9.505463 -8.200284 -5.2075014 5.3940268 + -3.04878 1.611095 10.127234 -10.534177 -15.821609 + 1.2032688 -0.35080156 1.2629458 -12.643498 -2.5758228 + -11.343508 2.3385992 -8.719341 14.213509 15.404744 + -0.39327756 6.338786 2.688887 8.7104025 17.469526 + -8.77959 7.0576906 4.648855 -1.3089896 -23.294737 + 8.013747 13.891729 -9.926753 5.655307 -5.9422326 + -22.842539 0.6293588 -18.46266 -10.811862 9.8192625 + 3.0070958 3.8072643 -2.3861165 3.0821571 -14.739942 + 1.7594414 -0.6485091 4.485623 2.0207152 7.264915 + -6.40137 23.63524 2.9711294 -22.708025 9.93719 + 20.354511 -10.324688 -0.700492 -8.783211 -5.27593 + 15.999649 3.3004563 12.747926 15.429879 4.7849145 + 5.6699696 -2.3826702 10.605882 3.9112158 3.1500628 + 15.859915 -2.1832209 -23.908653 -6.4799504 -4.5365124 + -9.224193 14.568347 -10.568833 4.982321 -4.342062 + 0.0914714 12.645902 -5.74285 -3.2141201 -2.7173362 + -6.680575 0.4757669 -5.035051 -6.7964664 16.865469 + -11.54324 7.681869 0.44475392 9.708182 -8.932846 + 0.4123232 -4.361452 1.3948607 9.511665 0.11667654 + 
2.9079323 6.049952 9.275183 -18.078873 6.2983274 + -0.7500531 -2.725033 -7.6027865 3.3404543 2.990815 + 4.010979 11.000591 -2.8873312 7.1352735 -16.79663 + 18.495346 -14.293832 7.89578 2.2714825 22.976387 + -4.875734 -3.0836344 -2.9999814 13.751918 6.448228 + -11.924197 2.171869 2.0423572 -6.173772 10.778437 + 25.77281 -4.9495463 14.57806 0.3044315 2.6132357 + -7.591999 -2.076944 9.025118 1.7834753 -3.1799617 + -4.9401326 23.465864 5.1685796 -9.018578 9.037825 + -4.4150195 6.859591 -12.274467 -0.88911164 5.186309 + -3.9988663 -13.638606 -9.925445 -0.06329413 -3.6709652 + -12.397416 -12.719869 -1.395601 2.1150916 5.7381287 + -4.4691963 -3.82819 -0.84233856 -1.1604277 -13.490127 + 8.731719 -20.778936 -11.495662 5.8033476 -4.752041 + 10.833007 -6.717991 4.504732 13.4244375 1.1306485 + 7.3435574 1.400918 14.704036 -9.501399 7.2315617 + -6.417456 1.3333273 11.872697 -0.30664724 8.8845 + 6.5569253 4.7948146 0.03662816 -8.704245 6.224871 + -3.2701402 -11.508579 ] + ``` + +### 4.Pretrained Models + +Here is a list of pretrained models released by PaddleSpeech that can be used by command and python API: + +| Model | Sample Rate +| :--- | :---: | +| ecapatdnn_voxceleb12 | 16k diff --git a/demos/speaker_verification/README_cn.md b/demos/speaker_verification/README_cn.md new file mode 100644 index 000000000..fe8949b3c --- /dev/null +++ b/demos/speaker_verification/README_cn.md @@ -0,0 +1,155 @@ +(简体中文|[English](./README.md)) + +# 声纹识别 +## 介绍 +声纹识别是一项用计算机程序自动提取说话人特征的技术。 + +这个 demo 是一个从给定音频文件提取说话人特征,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。 + +## 使用方法 +### 1. 安装 +请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。 + +你可以从 easy,medium,hard 三中方式中选择一种方式安装。 + +### 2. 准备输入 +这个 demo 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。 + +可以下载此 demo 的示例音频: +```bash +# 该音频的内容是数字串 85236145389 +wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav +``` +### 3. 
使用方法 +- 命令行 (推荐使用) + ```bash + paddlespeech vector --task spk --input 85236145389.wav + + echo -e "demo1 85236145389.wav" > vec.job + paddlespeech vector --task spk --input vec.job + + echo -e "demo2 85236145389.wav \n demo3 85236145389.wav" | paddlespeech vector --task spk + ``` + + 使用方法: + ```bash + paddlespeech vector --help + ``` + 参数: + - `input`(必须输入):用于识别的音频文件。 + - `model`:声纹任务的模型,默认值:`ecapatdnn_voxceleb12`。 + - `sample_rate`:音频采样率,默认值:`16000`。 + - `config`:声纹任务的参数文件,若不设置则使用预训练模型中的默认配置,默认值:`None`。 + - `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`。 + - `device`:执行预测的设备,默认值:当前系统下 paddlepaddle 的默认 device。 + + 输出: + ```bash + demo [ -5.749211 9.505463 -8.200284 -5.2075014 5.3940268 + -3.04878 1.611095 10.127234 -10.534177 -15.821609 + 1.2032688 -0.35080156 1.2629458 -12.643498 -2.5758228 + -11.343508 2.3385992 -8.719341 14.213509 15.404744 + -0.39327756 6.338786 2.688887 8.7104025 17.469526 + -8.77959 7.0576906 4.648855 -1.3089896 -23.294737 + 8.013747 13.891729 -9.926753 5.655307 -5.9422326 + -22.842539 0.6293588 -18.46266 -10.811862 9.8192625 + 3.0070958 3.8072643 -2.3861165 3.0821571 -14.739942 + 1.7594414 -0.6485091 4.485623 2.0207152 7.264915 + -6.40137 23.63524 2.9711294 -22.708025 9.93719 + 20.354511 -10.324688 -0.700492 -8.783211 -5.27593 + 15.999649 3.3004563 12.747926 15.429879 4.7849145 + 5.6699696 -2.3826702 10.605882 3.9112158 3.1500628 + 15.859915 -2.1832209 -23.908653 -6.4799504 -4.5365124 + -9.224193 14.568347 -10.568833 4.982321 -4.342062 + 0.0914714 12.645902 -5.74285 -3.2141201 -2.7173362 + -6.680575 0.4757669 -5.035051 -6.7964664 16.865469 + -11.54324 7.681869 0.44475392 9.708182 -8.932846 + 0.4123232 -4.361452 1.3948607 9.511665 0.11667654 + 2.9079323 6.049952 9.275183 -18.078873 6.2983274 + -0.7500531 -2.725033 -7.6027865 3.3404543 2.990815 + 4.010979 11.000591 -2.8873312 7.1352735 -16.79663 + 18.495346 -14.293832 7.89578 2.2714825 22.976387 + -4.875734 -3.0836344 -2.9999814 13.751918 6.448228 + -11.924197 2.171869 2.0423572 -6.173772 10.778437 + 25.77281 -4.9495463 14.57806 0.3044315 2.6132357 + -7.591999 -2.076944 9.025118 1.7834753 -3.1799617 + -4.9401326 23.465864 5.1685796 -9.018578 9.037825 + -4.4150195 6.859591 -12.274467 -0.88911164 5.186309 + -3.9988663 -13.638606 -9.925445 -0.06329413 -3.6709652 + -12.397416 -12.719869 -1.395601 2.1150916 5.7381287 + -4.4691963 -3.82819 -0.84233856 -1.1604277 -13.490127 + 8.731719 -20.778936 -11.495662 5.8033476 -4.752041 + 10.833007 -6.717991 4.504732 13.4244375 1.1306485 + 7.3435574 1.400918 14.704036 -9.501399 7.2315617 + -6.417456 1.3333273 11.872697 -0.30664724 8.8845 + 6.5569253 4.7948146 0.03662816 -8.704245 6.224871 + -3.2701402 -11.508579 ] + ``` + +- Python API + ```python + import paddle + from paddlespeech.cli import VectorExecutor + + vector_executor = VectorExecutor() + audio_emb = vector_executor( + model='ecapatdnn_voxceleb12', + sample_rate=16000, + config=None, # Set `config` and `ckpt_path` to None to use pretrained model. 
+ ckpt_path=None, + audio_file='./85236145389.wav', + force_yes=False, + device=paddle.get_device()) + print('Audio embedding Result: \n{}'.format(audio_emb)) + ``` + + 输出: + ```bash + # Vector Result: + [ -5.749211 9.505463 -8.200284 -5.2075014 5.3940268 + -3.04878 1.611095 10.127234 -10.534177 -15.821609 + 1.2032688 -0.35080156 1.2629458 -12.643498 -2.5758228 + -11.343508 2.3385992 -8.719341 14.213509 15.404744 + -0.39327756 6.338786 2.688887 8.7104025 17.469526 + -8.77959 7.0576906 4.648855 -1.3089896 -23.294737 + 8.013747 13.891729 -9.926753 5.655307 -5.9422326 + -22.842539 0.6293588 -18.46266 -10.811862 9.8192625 + 3.0070958 3.8072643 -2.3861165 3.0821571 -14.739942 + 1.7594414 -0.6485091 4.485623 2.0207152 7.264915 + -6.40137 23.63524 2.9711294 -22.708025 9.93719 + 20.354511 -10.324688 -0.700492 -8.783211 -5.27593 + 15.999649 3.3004563 12.747926 15.429879 4.7849145 + 5.6699696 -2.3826702 10.605882 3.9112158 3.1500628 + 15.859915 -2.1832209 -23.908653 -6.4799504 -4.5365124 + -9.224193 14.568347 -10.568833 4.982321 -4.342062 + 0.0914714 12.645902 -5.74285 -3.2141201 -2.7173362 + -6.680575 0.4757669 -5.035051 -6.7964664 16.865469 + -11.54324 7.681869 0.44475392 9.708182 -8.932846 + 0.4123232 -4.361452 1.3948607 9.511665 0.11667654 + 2.9079323 6.049952 9.275183 -18.078873 6.2983274 + -0.7500531 -2.725033 -7.6027865 3.3404543 2.990815 + 4.010979 11.000591 -2.8873312 7.1352735 -16.79663 + 18.495346 -14.293832 7.89578 2.2714825 22.976387 + -4.875734 -3.0836344 -2.9999814 13.751918 6.448228 + -11.924197 2.171869 2.0423572 -6.173772 10.778437 + 25.77281 -4.9495463 14.57806 0.3044315 2.6132357 + -7.591999 -2.076944 9.025118 1.7834753 -3.1799617 + -4.9401326 23.465864 5.1685796 -9.018578 9.037825 + -4.4150195 6.859591 -12.274467 -0.88911164 5.186309 + -3.9988663 -13.638606 -9.925445 -0.06329413 -3.6709652 + -12.397416 -12.719869 -1.395601 2.1150916 5.7381287 + -4.4691963 -3.82819 -0.84233856 -1.1604277 -13.490127 + 8.731719 -20.778936 -11.495662 5.8033476 -4.752041 + 10.833007 -6.717991 4.504732 13.4244375 1.1306485 + 7.3435574 1.400918 14.704036 -9.501399 7.2315617 + -6.417456 1.3333273 11.872697 -0.30664724 8.8845 + 6.5569253 4.7948146 0.03662816 -8.704245 6.224871 + -3.2701402 -11.508579 ] + ``` + +### 4.预训练模型 +以下是 PaddleSpeech 提供的可以被命令行和 python API 使用的预训练模型列表: + +| 模型 | 采样率 +| :--- | :---: | +| ecapatdnn_voxceleb12 | 16k diff --git a/demos/speaker_verification/run.sh b/demos/speaker_verification/run.sh new file mode 100644 index 000000000..856886d33 --- /dev/null +++ b/demos/speaker_verification/run.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav + +# asr +paddlespeech vector --task spk --input ./85236145389.wav \ No newline at end of file diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index 5d964fcea..636548801 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -84,5 +84,8 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by | Model | Language | Sample Rate | :--- | :---: | :---: | -| conformer_wenetspeech| zh| 16000 -| transformer_librispeech| en| 16000 +| conformer_wenetspeech| zh| 16k +| transformer_librispeech| en| 16k +| deepspeech2offline_aishell| zh| 16k +| deepspeech2online_aishell | zh | 16k +|deepspeech2offline_librispeech|en| 16k diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index ba1f1d65c..8033dbd81 100644 --- a/demos/speech_recognition/README_cn.md +++ 
b/demos/speech_recognition/README_cn.md @@ -81,5 +81,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee | 模型 | 语言 | 采样率 | :--- | :---: | :---: | -| conformer_wenetspeech| zh| 16000 -| transformer_librispeech| en| 16000 +| conformer_wenetspeech | zh | 16k +| transformer_librispeech | en | 16k +| deepspeech2offline_aishell| zh| 16k +| deepspeech2online_aishell | zh | 16k +| deepspeech2offline_librispeech | en | 16k diff --git a/demos/speech_server/.gitignore b/demos/speech_server/.gitignore new file mode 100644 index 000000000..d8dd7532a --- /dev/null +++ b/demos/speech_server/.gitignore @@ -0,0 +1 @@ +*.wav diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md index a2f6f2213..0323d3983 100644 --- a/demos/speech_server/README.md +++ b/demos/speech_server/README.md @@ -15,8 +15,8 @@ You can choose one way from meduim and hard to install paddlespeech. ### 2. Prepare config File The configuration file can be found in `conf/application.yaml` . -Among them, `engine_list` indicates the speech engine that will be included in the service to be started, in the format of _. -At present, the speech tasks integrated by the service include: asr (speech recognition) and tts (speech synthesis). +Among them, `engine_list` indicates the speech engine that will be included in the service to be started, in the format of `_`. +At present, the speech tasks integrated by the service include: asr (speech recognition), tts (text to sppech) and cls (audio classification). Currently the engine type supports two forms: python and inference (Paddle Inference) @@ -110,21 +110,22 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - Python API ```python from paddlespeech.server.bin.paddlespeech_client import ASRClientExecutor + import json asrclient_executor = ASRClientExecutor() - asrclient_executor( + res = asrclient_executor( input="./zh.wav", server_ip="127.0.0.1", port=8090, sample_rate=16000, lang="zh_cn", audio_format="wav") + print(res.json()) ``` Output: ```bash {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'transcription': '我认为跑步最重要的就是给我带来了身体健康'}} - time cost 0.604353 s. ``` ### 5. TTS Client Usage @@ -146,7 +147,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `speed`: Audio speed, the value should be set between 0 and 3. Default: 1.0 - `volume`: Audio volume, the value should be set between 0 and 3. Default: 1.0 - `sample_rate`: Sampling rate, choice: [0, 8000, 16000], the default is the same as the model. Default: 0 - - `output`: Output wave filepath. Default: `output.wav`. + - `output`: Output wave filepath. Default: None, which means not to save the audio to the local. Output: ```bash @@ -160,9 +161,10 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - Python API ```python from paddlespeech.server.bin.paddlespeech_client import TTSClientExecutor + import json ttsclient_executor = TTSClientExecutor() - ttsclient_executor( + res = ttsclient_executor( input="您好,欢迎使用百度飞桨语音合成服务。", server_ip="127.0.0.1", port=8090, @@ -171,6 +173,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee volume=1.0, sample_rate=0, output="./output.wav") + + response_dict = res.json() + print(response_dict["message"]) + print("Save synthesized audio successfully on %s." % (response_dict['result']['save_path'])) + print("Audio duration: %f s." 
%(response_dict['result']['duration'])) ``` Output: @@ -178,7 +185,52 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee {'description': 'success.'} Save synthesized audio successfully on ./output.wav. Audio duration: 3.612500 s. - Response time: 0.388317 s. + + ``` + +### 6. CLS Client Usage +**Note:** The response time will be slightly longer when using the client for the first time +- Command Line (Recommended) + ``` + paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input ./zh.wav + ``` + + Usage: + + ```bash + paddlespeech_client cls --help + ``` + Arguments: + - `server_ip`: server ip. Default: 127.0.0.1 + - `port`: server port. Default: 8090 + - `input`(required): Audio file to be classified. + - `topk`: topk scores of classification result. + + Output: + ```bash + [2022-03-09 20:44:39,974] [ INFO] - {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'topk': 1, 'results': [{'class_name': 'Speech', 'prob': 0.9027184844017029}]}} + [2022-03-09 20:44:39,975] [ INFO] - Response time 0.104360 s. + + + ``` + +- Python API + ```python + from paddlespeech.server.bin.paddlespeech_client import CLSClientExecutor + import json + + clsclient_executor = CLSClientExecutor() + res = clsclient_executor( + input="./zh.wav", + server_ip="127.0.0.1", + port=8090, + topk=1) + print(res.json()) + ``` + + Output: + ```bash + {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'topk': 1, 'results': [{'class_name': 'Speech', 'prob': 0.9027184844017029}]}} ``` @@ -189,3 +241,6 @@ Get all models supported by the ASR service via `paddlespeech_server stats --tas ### TTS model Get all models supported by the TTS service via `paddlespeech_server stats --task tts`, where static models can be used for paddle inference inference. + +### CLS model +Get all models supported by the CLS service via `paddlespeech_server stats --task cls`, where static models can be used for paddle inference inference. diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md index 762248a11..687b51f10 100644 --- a/demos/speech_server/README_cn.md +++ b/demos/speech_server/README_cn.md @@ -17,7 +17,7 @@ ### 2. 准备配置文件 配置文件可参见 `conf/application.yaml` 。 其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。 -目前服务集成的语音任务有: asr(语音识别)、tts(语音合成)。 +目前服务集成的语音任务有: asr(语音识别)、tts(语音合成)以及cls(音频分类)。 目前引擎类型支持两种形式:python 及 inference (Paddle Inference) @@ -80,7 +80,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ``` -### 4. ASR客户端使用方法 +### 4. ASR 客户端使用方法 **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) ``` @@ -111,25 +111,26 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - Python API ```python from paddlespeech.server.bin.paddlespeech_client import ASRClientExecutor + import json asrclient_executor = ASRClientExecutor() - asrclient_executor( + res = asrclient_executor( input="./zh.wav", server_ip="127.0.0.1", port=8090, sample_rate=16000, lang="zh_cn", audio_format="wav") + print(res.json()) ``` 输出: ```bash {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'transcription': '我认为跑步最重要的就是给我带来了身体健康'}} - time cost 0.604353 s. ``` -### 5. TTS客户端使用方法 +### 5. 
TTS 客户端使用方法 **注意:** 初次使用客户端时响应时间会略长 - 命令行 (推荐使用) @@ -150,7 +151,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `speed`: 音频速度,该值应设置在 0 到 3 之间。 默认值:1.0 - `volume`: 音频音量,该值应设置在 0 到 3 之间。 默认值: 1.0 - `sample_rate`: 采样率,可选 [0, 8000, 16000],默认与模型相同。 默认值:0 - - `output`: 输出音频的路径, 默认值:output.wav。 + - `output`: 输出音频的路径, 默认值:None,表示不保存音频到本地。 输出: ```bash @@ -163,9 +164,10 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - Python API ```python from paddlespeech.server.bin.paddlespeech_client import TTSClientExecutor + import json ttsclient_executor = TTSClientExecutor() - ttsclient_executor( + res = ttsclient_executor( input="您好,欢迎使用百度飞桨语音合成服务。", server_ip="127.0.0.1", port=8090, @@ -174,6 +176,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee volume=1.0, sample_rate=0, output="./output.wav") + + response_dict = res.json() + print(response_dict["message"]) + print("Save synthesized audio successfully on %s." % (response_dict['result']['save_path'])) + print("Audio duration: %f s." %(response_dict['result']['duration'])) ``` 输出: @@ -181,13 +188,63 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee {'description': 'success.'} Save synthesized audio successfully on ./output.wav. Audio duration: 3.612500 s. - Response time: 0.388317 s. ``` + ### 5. CLS 客户端使用方法 + **注意:** 初次使用客户端时响应时间会略长 + - 命令行 (推荐使用) + ``` + paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input ./zh.wav + ``` + + 使用帮助: + + ```bash + paddlespeech_client cls --help + ``` + 参数: + - `server_ip`: 服务端ip地址,默认: 127.0.0.1。 + - `port`: 服务端口,默认: 8090。 + - `input`(必须输入): 用于分类的音频文件。 + - `topk`: 分类结果的topk。 + + 输出: + ```bash + [2022-03-09 20:44:39,974] [ INFO] - {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'topk': 1, 'results': [{'class_name': 'Speech', 'prob': 0.9027184844017029}]}} + [2022-03-09 20:44:39,975] [ INFO] - Response time 0.104360 s. 
+ + + ``` + +- Python API + ```python + from paddlespeech.server.bin.paddlespeech_client import CLSClientExecutor + import json + + clsclient_executor = CLSClientExecutor() + res = clsclient_executor( + input="./zh.wav", + server_ip="127.0.0.1", + port=8090, + topk=1) + print(res.json()) + + ``` + + 输出: + ```bash + {'success': True, 'code': 200, 'message': {'description': 'success'}, 'result': {'topk': 1, 'results': [{'class_name': 'Speech', 'prob': 0.9027184844017029}]}} + + ``` + + ## 服务支持的模型 ### ASR支持的模型 通过 `paddlespeech_server stats --task asr` 获取ASR服务支持的所有模型,其中静态模型可用于 paddle inference 推理。 ### TTS支持的模型 通过 `paddlespeech_server stats --task tts` 获取TTS服务支持的所有模型,其中静态模型可用于 paddle inference 推理。 + +### CLS支持的模型 +通过 `paddlespeech_server stats --task cls` 获取CLS服务支持的所有模型,其中静态模型可用于 paddle inference 推理。 diff --git a/demos/speech_server/cls_client.sh b/demos/speech_server/cls_client.sh new file mode 100644 index 000000000..5797aa204 --- /dev/null +++ b/demos/speech_server/cls_client.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input ./zh.wav --topk 1 diff --git a/demos/speech_server/conf/application.yaml b/demos/speech_server/conf/application.yaml index 6048450b7..2b1a05998 100644 --- a/demos/speech_server/conf/application.yaml +++ b/demos/speech_server/conf/application.yaml @@ -9,12 +9,14 @@ port: 8090 # The task format in the engin_list is: _ # task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] -engine_list: ['asr_python', 'tts_python'] +engine_list: ['asr_python', 'tts_python', 'cls_python'] ################################################################################# # ENGINE CONFIG # ################################################################################# + +################################### ASR ######################################### ################### speech task: asr; engine_type: python ####################### asr_python: model: 'conformer_wenetspeech' @@ -46,6 +48,7 @@ asr_inference: summary: True # False -> do not show predictor config +################################### TTS ######################################### ################### speech task: tts; engine_type: python ####################### tts_python: # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc', @@ -105,3 +108,30 @@ tts_inference: # others lang: 'zh' + +################################### CLS ######################################### +################### speech task: cls; engine_type: python ####################### +cls_python: + # model choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6'] + model: 'panns_cnn14' + cfg_path: # [optional] Config of cls task. + ckpt_path: # [optional] Checkpoint file of model. + label_file: # [optional] Label file of cls task. + device: # set 'gpu:id' or 'cpu' + + +################### speech task: cls; engine_type: inference ####################### +cls_inference: + # model_type choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6'] + model_type: 'panns_cnn14' + cfg_path: + model_path: # the pdmodel file of am static model [optional] + params_path: # the pdiparams file of am static model [optional] + label_file: # [optional] Label file of cls task. 
+ + predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + diff --git a/docs/source/reference.md b/docs/source/reference.md index a8327e92e..f1a02d200 100644 --- a/docs/source/reference.md +++ b/docs/source/reference.md @@ -35,3 +35,7 @@ We borrowed a lot of code from these repos to build `model` and `engine`, thanks * [librosa](https://github.com/librosa/librosa/blob/main/LICENSE.md) - ISC License - Audio feature + +* [ThreadPool](https://github.com/progschj/ThreadPool/blob/master/COPYING) +- zlib License +- ThreadPool diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 8f855f7cf..9a423e03e 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -8,7 +8,8 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: [Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) [Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0) -[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.056 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) +[Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0565 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1) +[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0483 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1) [Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1) [Ds2 Offline Librispeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz)| Librispeech Dataset | Char-based | 518 MB | 2 Conv + 3 bidirectional LSTM layers| - |0.0725| 960 h | [Ds2 Offline Librispeech ASR0](../../examples/librispeech/asr0) [Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention 
rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../examples/librispeech/asr1) @@ -49,17 +50,20 @@ Model Type | Dataset| Example Link | Pretrained Models| Static Models|Size (stat WaveFlow| LJSpeech |[waveflow-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc0)|[waveflow_ljspeech_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/waveflow/waveflow_ljspeech_ckpt_0.3.zip)||| Parallel WaveGAN| CSMSC |[PWGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc1)|[pwg_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip)|[pwg_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_static_0.4.zip)|5.1MB| Parallel WaveGAN| LJSpeech |[PWGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc1)|[pwg_ljspeech_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip)||| -Parallel WaveGAN|AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)||| +Parallel WaveGAN| AISHELL-3 |[PWGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc1)|[pwg_aishell3_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip)||| Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc1)|[pwg_vctk_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.5.zip)||| |Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip)
[mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) |8.2MB| Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | | HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB| +HiFiGAN | LJSpeech |[HiFiGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc5)|[hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip)||| +HiFiGAN | AISHELL-3 |[HiFiGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc5)|[hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip)||| +HiFiGAN | VCTK |[HiFiGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc5)|[hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip)||| WaveRNN | CSMSC |[WaveRNN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc6)|[wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip)|[wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip)|18MB| ### Voice Cloning Model Type | Dataset| Example Link | Pretrained Models -:-------------:| :------------:| :-----: | :-----: +:-------------:| :------------:| :-----: | :-----: | GE2E| AISHELL-3, etc. 
|[ge2e](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/ge2e)|[ge2e_ckpt_0.3.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip) GE2E + Tactron2| AISHELL-3 |[ge2e-tactron2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc0)|[tacotron2_aishell3_ckpt_vc0_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_aishell3_ckpt_vc0_0.2.0.zip) GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc1)|[fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip) @@ -67,11 +71,17 @@ GE2E + FastSpeech2 | AISHELL-3 |[ge2e-fastspeech2-aishell3](https://github.com/ ## Audio Classification Models -Model Type | Dataset| Example Link | Pretrained Models -:-------------:| :------------:| :-----: | :-----: -PANN | Audioset| [audioset_tagging_cnn](https://github.com/qiuqiangkong/audioset_tagging_cnn) | [panns_cnn6.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams), [panns_cnn10.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams), [panns_cnn14.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams) +Model Type | Dataset| Example Link | Pretrained Models | Static Models +:-------------:| :------------:| :-----: | :-----: | :-----: +PANN | Audioset| [audioset_tagging_cnn](https://github.com/qiuqiangkong/audioset_tagging_cnn) | [panns_cnn6.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams), [panns_cnn10.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams), [panns_cnn14.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams) | [panns_cnn6_static.tar.gz](https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz)(18M), [panns_cnn10_static.tar.gz](https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz)(19M), [panns_cnn14_static.tar.gz](https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz)(289M) PANN | ESC-50 |[pann-esc50](../../examples/esc50/cls0)|[esc50_cnn6.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn6.tar.gz), [esc50_cnn10.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn10.tar.gz), [esc50_cnn14.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn14.tar.gz) +## Speaker Verification Models + +Model Type | Dataset| Example Link | Pretrained Models | Static Models +:-------------:| :------------:| :-----: | :-----: | :-----: +PANN | VoxCeleb| [voxceleb_ecapatdnn](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/voxceleb/sv0) | [ecapatdnn.tar.gz](https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_1.tar.gz) | - + ## Punctuation Restoration Models Model Type | Dataset| Example Link | Pretrained Models :-------------:| :------------:| :-----: | :-----: diff --git a/examples/aishell/asr1/README.md b/examples/aishell/asr1/README.md index 1226a4f4e..5277a31eb 100644 --- a/examples/aishell/asr1/README.md +++ b/examples/aishell/asr1/README.md @@ -168,30 +168,7 @@ bash local/data.sh --stage -1 --stop_stage -1 bash local/data.sh --stage 2 --stop_stage 2 CUDA_VISIBLE_DEVICES= ./local/test.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20 ``` -The performance of the released models are shown below: -### Conformer -| Model | 
Params | Config | Augmentation | Test set | Decode method | Loss | CER | -| --------- | ------ | ------------------- | ---------------- | -------- | ---------------------- | ---- | -------- | -| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 | -### Chunk Conformer -Need set `decoding.decoding_chunk_size=16` when decoding. -| Model | Params | Config | Augmentation | Test set | Decode method | Chunk Size & Left Chunks | Loss | CER | -| --------- | ------ | ------------------------- | ---------------- | -------- | ---------------------- | ------------------------ | ---- | -------- | -| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16, -1 | - | 0.061939 | -| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 | -| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 | -| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 | - -### Transformer -| Model | Params | Config | Augmentation | Test set | Decode method | Loss | CER | -| ----------- | ------ | --------------------- | ------------ | -------- | ---------------------- | ----------------- | -------- | -| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 | -| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 | -| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 | -| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 | +[The performance of the released models](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/aishell/asr1/RESULTS.md) ## Stage 4: CTC Alignment If you want to get the alignment between the audio and the text, you can use the ctc alignment. 
The code of this stage is shown below: ```bash diff --git a/examples/aishell/asr1/RESULTS.md b/examples/aishell/asr1/RESULTS.md index b68d69924..73cd57bda 100644 --- a/examples/aishell/asr1/RESULTS.md +++ b/examples/aishell/asr1/RESULTS.md @@ -1,24 +1,27 @@ # Aishell ## Conformer - -| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | -| --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 | -| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 | +paddle version: 2.2.2 +paddlespeech version: 0.1.2 +| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER | +| --- | --- | --- | --- | --- | --- | --- | --- | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0548 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.05127 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug| test | ctc_prefix_beam_search | - | 0.05131 | +| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.04829 | ## Chunk Conformer +paddle version: 2.2.2 +paddlespeech version: 0.1.2 Need set `decoding.decoding_chunk_size=16` when decoding. | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | CER | | --- | --- | --- | --- | --- | --- | --- | --- | --- | -| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16, -1 | - | 0.061939 | -| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 | -| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 | -| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 | +| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | attention | 16, -1 | - | 0.0573884 | +| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | ctc_greedy_search | 16, -1 | - | 0.06599091 | +| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | ctc_prefix_beam_search | 16, -1 | - | 0.065991 | +| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | 0.056502 | ## Transformer diff --git a/examples/aishell/asr1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml index 68e852ba7..9f70e4c57 100644 --- a/examples/aishell/asr1/conf/chunk_conformer.yaml +++ b/examples/aishell/asr1/conf/chunk_conformer.yaml @@ -39,6 +39,7 @@ model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option length_normalized_loss: false + init_type: 'kaiming_uniform' ########################################### # Data # @@ -61,28 +62,29 @@ feat_dim: 80 stride_ms: 10.0 window_ms: 25.0 sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs -batch_size: 64 +batch_size: 32 maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced minibatches: 0 # for debug 
batch_count: auto -batch_bins: 0 +batch_bins: 0 batch_frames_in: 0 batch_frames_out: 0 batch_frames_inout: 0 -num_workers: 0 +num_workers: 2 subsampling_factor: 1 num_encs: 1 ########################################### # Training # ########################################### -n_epoch: 240 -accum_grad: 2 +n_epoch: 180 +accum_grad: 1 global_grad_clip: 5.0 +dist_sampler: True optim: adam optim_conf: - lr: 0.002 + lr: 0.001 weight_decay: 1.0e-6 scheduler: warmuplr scheduler_conf: @@ -92,4 +94,3 @@ log_interval: 100 checkpoint: kbest_n: 50 latest_n: 5 - diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml index 775a4527d..a150a04d5 100644 --- a/examples/aishell/asr1/conf/conformer.yaml +++ b/examples/aishell/asr1/conf/conformer.yaml @@ -37,6 +37,7 @@ model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option length_normalized_loss: false + init_type: 'kaiming_uniform' ########################################### # Data # @@ -75,6 +76,7 @@ num_encs: 1 n_epoch: 240 accum_grad: 2 global_grad_clip: 5.0 +dist_sampler: True optim: adam optim_conf: lr: 0.002 diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml index f7f4c58d5..d3992cb9f 100644 --- a/examples/aishell/asr1/conf/preprocess.yaml +++ b/examples/aishell/asr1/conf/preprocess.yaml @@ -23,7 +23,3 @@ process: n_mask: 2 inplace: true replace_with_zero: false - - - - diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml index 9d2946537..9e08ea0ec 100644 --- a/examples/aishell/asr1/conf/transformer.yaml +++ b/examples/aishell/asr1/conf/transformer.yaml @@ -61,16 +61,17 @@ batch_frames_in: 0 batch_frames_out: 0 batch_frames_inout: 0 preprocess_config: conf/preprocess.yaml -num_workers: 0 +num_workers: 2 subsampling_factor: 1 num_encs: 1 ########################################### # Training # ########################################### -n_epoch: 240 +n_epoch: 30 accum_grad: 2 global_grad_clip: 5.0 +dist_sampler: False optim: adam optim_conf: lr: 0.002 diff --git a/examples/aishell3/tts3/local/synthesize.sh b/examples/aishell3/tts3/local/synthesize.sh index b1fc96a2d..d3978833f 100755 --- a/examples/aishell3/tts3/local/synthesize.sh +++ b/examples/aishell3/tts3/local/synthesize.sh @@ -4,18 +4,44 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --am=fastspeech2_aishell3 \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_aishell3 \ - --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \ - --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ - --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt \ - --speaker_dict=dump/speaker_id_map.txt +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_aishell3 \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_aishell3 \ + --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \ + --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz 
\ + --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt +fi + +# hifigan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_aishell3 \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_aishell3 \ + --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \ + --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pd \ + --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt +fi + diff --git a/examples/aishell3/tts3/local/synthesize_e2e.sh b/examples/aishell3/tts3/local/synthesize_e2e.sh index 60e1a5cee..ff3608be7 100755 --- a/examples/aishell3/tts3/local/synthesize_e2e.sh +++ b/examples/aishell3/tts3/local/synthesize_e2e.sh @@ -4,21 +4,50 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize_e2e.py \ - --am=fastspeech2_aishell3 \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_aishell3 \ - --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \ - --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ - --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ - --lang=zh \ - --text=${BIN_DIR}/../sentences.txt \ - --output_dir=${train_output_path}/test_e2e \ - --phones_dict=dump/phone_id_map.txt \ - --speaker_dict=dump/speaker_id_map.txt \ - --spk_id=0 \ - --inference_dir=${train_output_path}/inference +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_aishell3 \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_aishell3 \ + --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \ + --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ + --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --spk_id=0 \ + --inference_dir=${train_output_path}/inference +fi + +# hifigan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo "in hifigan syn_e2e" + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_aishell3 \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=fastspeech2_nosil_aishell3_ckpt_0.4/speech_stats.npy \ + --voc=hifigan_aishell3 \ + --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \ + --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + 
--output_dir=${train_output_path}/test_e2e \ + --phones_dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \ + --speaker_dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt \ + --spk_id=0 \ + --inference_dir=${train_output_path}/inference + fi diff --git a/examples/aishell3/vc0/local/preprocess.sh b/examples/aishell3/vc0/local/preprocess.sh index 069cf94c4..e458c7063 100755 --- a/examples/aishell3/vc0/local/preprocess.sh +++ b/examples/aishell3/vc0/local/preprocess.sh @@ -1,6 +1,6 @@ #!/bin/bash -stage=3 +stage=0 stop_stage=100 config_path=$1 diff --git a/examples/aishell3/voc1/run.sh b/examples/aishell3/voc1/run.sh index 4f426ea02..cab1ac38b 100755 --- a/examples/aishell3/voc1/run.sh +++ b/examples/aishell3/voc1/run.sh @@ -3,7 +3,7 @@ set -e source path.sh -gpus=0 +gpus=0,1 stage=0 stop_stage=100 diff --git a/examples/aishell3/voc5/README.md b/examples/aishell3/voc5/README.md new file mode 100644 index 000000000..ebe2530be --- /dev/null +++ b/examples/aishell3/voc5/README.md @@ -0,0 +1,156 @@ +# HiFiGAN with AISHELL-3 +This example contains code used to train a [HiFiGAN](https://arxiv.org/abs/2010.05646) model with [AISHELL-3](http://www.aishelltech.com/aishell_3). + +AISHELL-3 is a large-scale and high-fidelity multi-speaker Mandarin speech corpus that could be used to train multi-speaker Text-to-Speech (TTS) systems. +## Dataset +### Download and Extract +Download AISHELL-3. +```bash +wget https://www.openslr.org/resources/93/data_aishell3.tgz +``` +Extract AISHELL-3. +```bash +mkdir data_aishell3 +tar zxvf data_aishell3.tgz -C data_aishell3 +``` +### Get MFA Result and Extract +We use [MFA2.x](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get durations for aishell3_fastspeech2. +You can download from here [aishell3_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/aishell3_alignment_tone.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) (use MFA1.x now) of our repo. + +## Get Started +Assume the path to the dataset is `~/datasets/data_aishell3`. +Assume the path to the MFA result of AISHELL-3 is `./aishell3_alignment_tone`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, run the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── feats_stats.npy +``` + +The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the norm folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set, which is located in `dump/train/feats_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains id and paths to the spectrogram of each utterance. 
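+If you want a quick sanity check of the preprocessing output before training, note that each line of `metadata.jsonl` is a standalone JSON object, so pretty-printing the first record is enough. A minimal sketch (the exact field names are simply whatever `preprocess.py` wrote, so just look at what is there):
+```bash
+# show one normalized training record; works because every jsonl line is one JSON object
+head -n 1 dump/train/norm/metadata.jsonl | python3 -m json.tool
+```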
+ +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. + +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] + [--run-benchmark RUN_BENCHMARK] + [--profiler_options PROFILER_OPTIONS] + +Train a ParallelWaveGAN model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG config file to overwrite default config. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + +benchmark: + arguments related to benchmark. + + --batch-size BATCH_SIZE + batch size. + --max-iter MAX_ITER train max steps. + --run-benchmark RUN_BENCHMARK + runing benchmark or not, if True, use the --batch-size + and --max-iter. + --profiler_options PROFILER_OPTIONS + The option of profiler, which should be in format + "key1=value1;key2=value2;key3=value3". +``` + +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +### Synthesizing +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] + [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] + [--output-dir OUTPUT_DIR] [--ngpu NGPU] + +Synthesize with GANVocoder. + +optional arguments: + -h, --help show this help message and exit + --generator-type GENERATOR_TYPE + type of GANVocoder, should in {pwgan, mb_melgan, + style_melgan, } now + --config CONFIG GANVocoder config file. + --checkpoint CHECKPOINT + snapshot to load. + --test-metadata TEST_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. +``` + +1. `--config` config file. You should use the same config with which the model is trained. +2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. +3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. +4. `--output-dir` is the directory to save the synthesized audio files. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +## Pretrained Models +The pretrained model can be downloaded here [hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip). + + +Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss +:-------------:| :------------:| :-----: | :-----: | :--------: +default| 1(gpu) x 2500000|24.060|0.1068|7.499 + +HiFiGAN checkpoint contains files listed below. 
+ +```text +hifigan_aishell3_ckpt_0.2.0 +├── default.yaml # default config used to train hifigan +├── feats_stats.npy # statistics used to normalize spectrogram when training hifigan +└── snapshot_iter_2500000.pdz # generator parameters of hifigan +``` + +## Acknowledgement +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/aishell3/voc5/conf/default.yaml b/examples/aishell3/voc5/conf/default.yaml new file mode 100644 index 000000000..728a90369 --- /dev/null +++ b/examples/aishell3/voc5/conf/default.yaml @@ -0,0 +1,168 @@ +# This is the configuration file for AISHELL-3 dataset. +# This configuration is based on HiFiGAN V1, which is +# an official configuration. But I found that the optimizer +# setting does not work well with my implementation. +# So I changed optimizer settings as follows: +# - AdamW -> Adam +# - betas: [0.8, 0.99] -> betas: [0.5, 0.9] +# - Scheduler: ExponentialLR -> MultiStepLR +# To match the shift size difference, the upsample scales +# is also modified from the original 256 shift setting. +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 2048 # FFT size (samples). +n_shift: 300 # Hop size (samples). 12.5ms +win_length: 1200 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 7600 # Maximum frequency in mel basis calculation. (Hz) + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 80 # Number of input channels. + out_channels: 1 # Number of output channels. + channels: 512 # Number of initial channels. + kernel_size: 7 # Kernel size of initial and final conv layers. + upsample_scales: [5, 5, 4, 3] # Upsampling scales. + upsample_kernel_sizes: [10, 10, 8, 6] # Kernel size for upsampling layers. + resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks. + resblock_dilations: # Dilations for residual blocks. + - [1, 3, 5] + - [1, 3, 5] + - [1, 3, 5] + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. + nonlinear_activation: "leakyrelu" # Nonlinear activation type. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + scales: 3 # Number of multi-scale discriminator. + scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator. + scale_downsample_pooling_params: + kernel_size: 4 # Pooling kernel size. + stride: 2 # Pooling stride. + padding: 2 # Padding size. + scale_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [15, 41, 5, 3] # List of kernel sizes. + channels: 128 # Initial number of channels. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + max_groups: 16 # Maximum number of groups in downsampling conv layers. 
+ bias: True + downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: + negative_slope: 0.1 + follow_official_norm: True # Whether to follow the official norm setting. + periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. + period_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [5, 3] # List of kernel sizes. + channels: 32 # Initial number of channels. + downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + bias: True # Whether to use bias parameter in conv layer." + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. + + +########################################################### +# STFT LOSS SETTING # +########################################################### +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. +mel_loss_params: + fs: 24000 + fft_size: 2048 + hop_size: 300 + win_length: 1200 + window: "hann" + num_mels: 80 + fmin: 0 + fmax: 12000 + log_base: null +generator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +discriminator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True +feat_match_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. + average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. + +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +lambda_aux: 45.0 # Loss balancing coefficient for STFT loss. +lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss. +lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.. + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 16 # Batch size. +batch_max_steps: 8400 # Length of each audio in batch. Make sure dividable by hop_size. +num_workers: 2 # Number of workers in DataLoader. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Generator's weight decay coefficient. +generator_scheduler_params: + learning_rate: 2.0e-4 # Generator's learning rate. + gamma: 0.5 # Generator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +generator_grad_norm: -1 # Generator's gradient norm. +discriminator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Discriminator's weight decay coefficient. +discriminator_scheduler_params: + learning_rate: 2.0e-4 # Discriminator's learning rate. + gamma: 0.5 # Discriminator's scheduler gamma. 
+ milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +discriminator_grad_norm: -1 # Discriminator's gradient norm. + +########################################################### +# INTERVAL SETTING # +########################################################### +generator_train_start_steps: 1 # Number of steps to start to train discriminator. +discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. +train_max_steps: 2500000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. + +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh new file mode 100755 index 000000000..44cc3dbe4 --- /dev/null +++ b/examples/aishell3/voc5/local/preprocess.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./aishell3_alignment_tone \ + --output=durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/../preprocess.py \ + --rootdir=~/datasets/data_aishell3/ \ + --dataset=aishell3 \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --cut-sil=True \ + --num-cpu=20 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="feats" +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." 
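+    # All three normalize.py calls below point --stats at dump/train/feats_stats.npy on
+    # purpose: dev and test are scaled with the statistics computed on the training split,
+    # so that evaluation features match what the model sees during training.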
+ + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --stats=dump/train/feats_stats.npy + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --stats=dump/train/feats_stats.npy + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --stats=dump/train/feats_stats.npy +fi diff --git a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh new file mode 100755 index 000000000..647896175 --- /dev/null +++ b/examples/aishell3/voc5/local/synthesize.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test \ + --generator-type=hifigan diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh new file mode 100755 index 000000000..9695631ef --- /dev/null +++ b/examples/aishell3/voc5/local/train.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +FLAGS_cudnn_exhaustive_search=true \ +FLAGS_conv_workspace_size_limit=4000 \ +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh new file mode 100755 index 000000000..7451b3218 --- /dev/null +++ b/examples/aishell3/voc5/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=hifigan +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/aishell3/voc5/run.sh b/examples/aishell3/voc5/run.sh new file mode 100755 index 000000000..4f426ea02 --- /dev/null +++ b/examples/aishell3/voc5/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_5000.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... 
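+# More usage sketches, assuming the Kaldi-style parse_options.sh behaviour of mapping
+# `--name value` onto the variables declared above:
+#   ./run.sh --stage 1 --stop-stage 1                                      # train only, reusing an existing dump/
+#   ./run.sh --stage 2 --stop-stage 2 --ckpt_name snapshot_iter_5000.pdz   # synthesize with a specific checkpoint
+#   ./run.sh --gpus 0,1 --stage 1 --stop-stage 1                           # override the GPU list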
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/ami/sd0/local/ami_prepare.py b/examples/ami/sd0/local/ami_prepare.py index 01582dbdd..1f02afe00 100644 --- a/examples/ami/sd0/local/ami_prepare.py +++ b/examples/ami/sd0/local/ami_prepare.py @@ -18,18 +18,17 @@ Download: http://groups.inf.ed.ac.uk/ami/download/ Prepares metadata files (JSON) from manual annotations "segments/" using RTTM format (Oracle VAD). """ - import argparse import glob import json import logging import os import xml.etree.ElementTree as et -from distutils.util import strtobool from ami_splits import get_AMI_split from dataio import load_pkl from dataio import save_pkl +from distutils.util import strtobool logger = logging.getLogger(__name__) SAMPLERATE = 16000 diff --git a/examples/csmsc/tts0/local/synthesize_e2e.sh b/examples/csmsc/tts0/local/synthesize_e2e.sh index f76758733..4c3b08dc1 100755 --- a/examples/csmsc/tts0/local/synthesize_e2e.sh +++ b/examples/csmsc/tts0/local/synthesize_e2e.sh @@ -7,7 +7,7 @@ ckpt_name=$3 stage=0 stop_stage=0 -# TODO: tacotron2 动转静的结果没有静态图的响亮, 可能还是 decode 的时候某个函数动静不对齐 +# TODO: tacotron2 动转静的结果没有动态图的响亮, 可能还是 decode 的时候某个函数动静不对齐 # pwgan if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then FLAGS_allocator_strategy=naive_best_fit \ diff --git a/examples/csmsc/tts2/local/synthesize.sh b/examples/csmsc/tts2/local/synthesize.sh index 37b298183..b8982a16d 100755 --- a/examples/csmsc/tts2/local/synthesize.sh +++ b/examples/csmsc/tts2/local/synthesize.sh @@ -14,7 +14,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then --am=speedyspeech_csmsc \ --am_config=${config_path} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ + --am_stat=dump/train/feats_stats.npy \ --voc=pwgan_csmsc \ --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ @@ -34,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --am=speedyspeech_csmsc \ --am_config=${config_path} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ + --am_stat=dump/train/feats_stats.npy \ --voc=mb_melgan_csmsc \ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ @@ -53,7 +53,7 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then --am=speedyspeech_csmsc \ --am_config=${config_path} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ + --am_stat=dump/train/feats_stats.npy \ --voc=style_melgan_csmsc \ --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ @@ -73,7 +73,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then --am=speedyspeech_csmsc \ --am_config=${config_path} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ + --am_stat=dump/train/feats_stats.npy \ 
--voc=hifigan_csmsc \ --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ @@ -93,7 +93,7 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then --am=speedyspeech_csmsc \ --am_config=${config_path} \ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ + --am_stat=dump/train/feats_stats.npy \ --voc=wavernn_csmsc \ --voc_config=wavernn_csmsc_ckpt_0.2.0/default.yaml \ --voc_ckpt=wavernn_csmsc_ckpt_0.2.0/snapshot_iter_400000.pdz \ diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index 7b803526f..ae8f7af60 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -226,8 +226,11 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} Pretrained FastSpeech2 model with no silence in the edge of audios: - [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip) - [fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip) +- [fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip) -The static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip). +The static model can be downloaded here: +- [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip) +- [fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip) Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss :-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------: diff --git a/examples/csmsc/tts3/conf/cnndecoder.yaml b/examples/csmsc/tts3/conf/cnndecoder.yaml new file mode 100644 index 000000000..8b46fea44 --- /dev/null +++ b/examples/csmsc/tts3/conf/cnndecoder.yaml @@ -0,0 +1,107 @@ +# use CNND +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### + +fs: 24000 # sr +n_fft: 2048 # FFT size (samples). +n_shift: 300 # Hop size (samples). 12.5ms +win_length: 1200 # Window length (samples). 50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. + +# Only used for feats_type != raw + +fmin: 80 # Minimum frequency of Mel basis. +fmax: 7600 # Maximum frequency of Mel basis. +n_mels: 80 # The number of mel basis. + +# Only used for the model using pitch features (e.g. FastSpeech2) +f0min: 80 # Minimum f0 for pitch extraction. +f0max: 400 # Maximum f0 for pitch extraction. 
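+# Note: the MODEL SETTING section below sets decoder_type to cnndecoder; run_cnndecoder.sh
+# pairs this config with local/synthesize_streaming.sh and --am_streaming=True, presumably
+# because the CNN decoder can emit mel chunks incrementally rather than decoding the whole
+# utterance at once.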
+ + +########################################################### +# DATA SETTING # +########################################################### +batch_size: 64 +num_workers: 4 + + +########################################################### +# MODEL SETTING # +########################################################### +model: + adim: 384 # attention dimension + aheads: 2 # number of attention heads + elayers: 4 # number of encoder layers + eunits: 1536 # number of encoder ff units + dlayers: 4 # number of decoder layers + dunits: 1536 # number of decoder ff units + positionwise_layer_type: conv1d # type of position-wise layer + positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer + duration_predictor_layers: 2 # number of layers of duration predictor + duration_predictor_chans: 256 # number of channels of duration predictor + duration_predictor_kernel_size: 3 # filter size of duration predictor + postnet_layers: 5 # number of layers of postnset + postnet_filts: 5 # filter size of conv layers in postnet + postnet_chans: 256 # number of channels of conv layers in postnet + use_scaled_pos_enc: True # whether to use scaled positional encoding + encoder_normalize_before: True # whether to perform layer normalization before the input + decoder_normalize_before: True # whether to perform layer normalization before the input + reduction_factor: 1 # reduction factor + encoder_type: transformer # encoder type + decoder_type: cnndecoder # decoder type + init_type: xavier_uniform # initialization type + init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding + init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding + transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer + transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding + transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer + cnn_dec_dropout_rate: 0.2 # dropout rate for cnn decoder layer + cnn_postnet_dropout_rate: 0.2 + cnn_postnet_resblock_kernel_sizes: [256, 256] # kernel sizes for residual block of cnn_postnet + cnn_postnet_kernel_size: 5 # kernel size of cnn_postnet + cnn_decoder_embedding_dim: 256 + pitch_predictor_layers: 5 # number of conv layers in pitch predictor + pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor + pitch_predictor_kernel_size: 5 # kernel size of conv leyers in pitch predictor + pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor + pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch + pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch + stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder + energy_predictor_layers: 2 # number of conv layers in energy predictor + energy_predictor_chans: 256 # number of channels of conv layers in energy predictor + energy_predictor_kernel_size: 3 # kernel size of conv leyers in energy predictor + energy_predictor_dropout: 0.5 # dropout rate in energy predictor + energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy + energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy + stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder + + + +########################################################### +# UPDATER SETTING # 
+########################################################### +updater: + use_masking: True # whether to apply masking for padded part in loss calculation + + +########################################################### +# OPTIMIZER SETTING # +########################################################### +optimizer: + optim: adam # optimizer type + learning_rate: 0.001 # learning rate + +########################################################### +# TRAINING SETTING # +########################################################### +max_epoch: 1000 +num_snapshots: 5 + + +########################################################### +# OTHER SETTING # +########################################################### +seed: 10086 diff --git a/examples/csmsc/tts3/local/synthesize_streaming.sh b/examples/csmsc/tts3/local/synthesize_streaming.sh new file mode 100755 index 000000000..7606c2385 --- /dev/null +++ b/examples/csmsc/tts3/local/synthesize_streaming.sh @@ -0,0 +1,92 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_streaming.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_csmsc \ + --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \ + --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e_streaming \ + --phones_dict=dump/phone_id_map.txt \ + --am_streaming=True +fi + +# for more GAN Vocoders +# multi band melgan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_streaming.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=mb_melgan_csmsc \ + --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\ + --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e_streaming \ + --phones_dict=dump/phone_id_map.txt \ + --am_streaming=True +fi + +# the pretrained models haven't release now +# style melgan +# style melgan's Dygraph to Static Graph is not ready now +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_streaming.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=style_melgan_csmsc \ + --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e_streaming \ + --phones_dict=dump/phone_id_map.txt \ + --am_streaming=True +fi + +# hifigan +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "in hifigan syn_e2e" + 
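+    # The hifigan_csmsc_ckpt_0.1.1 paths below assume the released vocoder checkpoint has
+    # already been downloaded and unpacked in the directory this script is run from, e.g.
+    # (URL as listed in docs/source/released_model.md):
+    #   wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip
+    #   unzip hifigan_csmsc_ckpt_0.1.1.zip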
FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_streaming.py \ + --am=fastspeech2_csmsc \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_csmsc \ + --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \ + --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \ + --lang=zh \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/test_e2e_streaming \ + --phones_dict=dump/phone_id_map.txt \ + --am_streaming=True +fi diff --git a/examples/csmsc/tts3/run_cnndecoder.sh b/examples/csmsc/tts3/run_cnndecoder.sh new file mode 100755 index 000000000..5cccef016 --- /dev/null +++ b/examples/csmsc/tts3/run_cnndecoder.sh @@ -0,0 +1,48 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/cnndecoder.yaml +train_output_path=exp/cnndecoder +ckpt_name=snapshot_iter_153.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # synthesize_e2e, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + # inference with static model + CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # synthesize_e2e, vocoder is pwgan + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_streaming.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi + diff --git a/examples/esc50/README.md b/examples/esc50/README.md index 911a72ad7..9eab95d26 100644 --- a/examples/esc50/README.md +++ b/examples/esc50/README.md @@ -4,7 +4,7 @@ 对于声音分类任务,传统机器学习的一个常用做法是首先人工提取音频的时域和频域的多种特征并做特征选择、组合、变换等,然后基于SVM或决策树进行分类。而端到端的深度学习则通常利用深度网络如RNN,CNN等直接对声间波形(waveform)或时频特征(time-frequency)进行特征学习(representation learning)和分类预测。 -在IEEE ICASSP 2017 大会上,谷歌开放了一个大规模的音频数据集[Audioset](https://research.google.com/audioset/)。该数据集包含了 632 类的音频类别以及 2,084,320 条人工标记的每段 10 秒长度的声音剪辑片段(来源于YouTube视频)。目前该数据集已经有210万个已标注的视频数据,5800小时的音频数据,经过标记的声音样本的标签类别为527。 +在IEEE ICASSP 2017 大会上,谷歌开放了一个大规模的音频数据集[Audioset](https://research.google.com/audioset/)。该数据集包含了 632 类的音频类别以及 2,084,320 条人工标记的每段 **10 秒**长度的声音剪辑片段(来源于YouTube视频)。目前该数据集已经有 210万 个已标注的视频数据,5800 小时的音频数据,经过标记的声音样本的标签类别为 527。 `PANNs`([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf))是基于Audioset数据集训练的声音分类/识别的模型。经过预训练后,模型可以用于提取音频的embbedding。本示例将使用`PANNs`的预训练模型Finetune完成声音分类的任务。 @@ -12,14 +12,14 @@ ## 模型简介 
PaddleAudio提供了PANNs的CNN14、CNN10和CNN6的预训练模型,可供用户选择使用: -- CNN14: 该模型主要包含12个卷积层和2个全连接层,模型参数的数量为79.6M,embbedding维度是2048。 -- CNN10: 该模型主要包含8个卷积层和2个全连接层,模型参数的数量为4.9M,embbedding维度是512。 -- CNN6: 该模型主要包含4个卷积层和2个全连接层,模型参数的数量为4.5M,embbedding维度是512。 +- CNN14: 该模型主要包含12个卷积层和2个全连接层,模型参数的数量为 79.6M,embbedding维度是 2048。 +- CNN10: 该模型主要包含8个卷积层和2个全连接层,模型参数的数量为 4.9M,embbedding维度是 512。 +- CNN6: 该模型主要包含4个卷积层和2个全连接层,模型参数的数量为 4.5M,embbedding维度是 512。 ## 数据集 -[ESC-50: Dataset for Environmental Sound Classification](https://github.com/karolpiczak/ESC-50) 是一个包含有 2000 个带标签的环境声音样本,音频样本采样率为 44,100Hz 的单通道音频文件,所有样本根据标签被划分为 50 个类别,每个类别有 40 个样本。 +[ESC-50: Dataset for Environmental Sound Classification](https://github.com/karolpiczak/ESC-50) 是一个包含有 2000 个带标签的时长为 **5 秒**的环境声音样本,音频样本采样率为 44,100Hz 的单通道音频文件,所有样本根据标签被划分为 50 个类别,每个类别有 40 个样本。 ## 模型指标 @@ -43,13 +43,13 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 1 conf/panns.yaml ``` 训练的参数可在 `conf/panns.yaml` 的 `training` 中配置,其中: -- `epochs`: 训练轮次,默认为50。 +- `epochs`: 训练轮次,默认为 50。 - `learning_rate`: Fine-tune的学习率;默认为5e-5。 -- `batch_size`: 批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数;默认为16。 +- `batch_size`: 批处理大小,请结合显存情况进行调整,若出现显存不足,请适当调低这一参数;默认为 16。 - `num_workers`: Dataloader获取数据的子进程数。默认为0,加载数据的流程在主进程执行。 - `checkpoint_dir`: 模型参数文件和optimizer参数文件的保存目录,默认为`./checkpoint`。 -- `save_freq`: 训练过程中的模型保存频率,默认为10。 -- `log_freq`: 训练过程中的信息打印频率,默认为10。 +- `save_freq`: 训练过程中的模型保存频率,默认为 10。 +- `log_freq`: 训练过程中的信息打印频率,默认为 10。 示例代码中使用的预训练模型为`CNN14`,如果想更换为其他预训练模型,可通过修改 `conf/panns.yaml` 的 `model` 中配置: ```yaml @@ -76,7 +76,7 @@ $ CUDA_VISIBLE_DEVICES=0 ./run.sh 2 conf/panns.yaml 训练的参数可在 `conf/panns.yaml` 的 `predicting` 中配置,其中: - `audio_file`: 指定预测的音频文件。 -- `top_k`: 预测显示的top k标签的得分,默认为1。 +- `top_k`: 预测显示的top k标签的得分,默认为 1。 - `checkpoint`: 模型参数checkpoint文件。 输出的预测结果如下: diff --git a/examples/ljspeech/tts3/local/synthesize.sh b/examples/ljspeech/tts3/local/synthesize.sh index f150d158f..6dc34274c 100755 --- a/examples/ljspeech/tts3/local/synthesize.sh +++ b/examples/ljspeech/tts3/local/synthesize.sh @@ -4,17 +4,42 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --am=fastspeech2_ljspeech \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_ljspeech \ - --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \ - --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \ - --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_ljspeech \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_ljspeech \ + --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \ + --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + +# hifigan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 
\ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_ljspeech \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_ljspeech \ + --voc_config=hifigan_ljspeech_ckpt_0.2.0/default.yaml \ + --voc_ckpt=hifigan_ljspeech_ckpt_0.2.0/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_ljspeech_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt +fi + diff --git a/examples/ljspeech/tts3/local/synthesize_e2e.sh b/examples/ljspeech/tts3/local/synthesize_e2e.sh index 0b0cb5741..36865f7f1 100755 --- a/examples/ljspeech/tts3/local/synthesize_e2e.sh +++ b/examples/ljspeech/tts3/local/synthesize_e2e.sh @@ -4,19 +4,45 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize_e2e.py \ - --am=fastspeech2_ljspeech \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_ljspeech \ - --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \ - --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \ - --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ - --lang=en \ - --text=${BIN_DIR}/../sentences_en.txt \ - --output_dir=${train_output_path}/test_e2e \ - --inference_dir=${train_output_path}/inference \ - --phones_dict=dump/phone_id_map.txt \ No newline at end of file +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_ljspeech \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_ljspeech \ + --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \ + --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \ + --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \ + --lang=en \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output_dir=${train_output_path}/test_e2e \ + --inference_dir=${train_output_path}/inference \ + --phones_dict=dump/phone_id_map.txt +fi + +# hifigan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_ljspeech \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_ljspeech \ + --voc_config=hifigan_ljspeech_ckpt_0.2.0/default.yaml \ + --voc_ckpt=hifigan_ljspeech_ckpt_0.2.0/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_ljspeech_ckpt_0.2.0/feats_stats.npy \ + --lang=en \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output_dir=${train_output_path}/test_e2e \ + --inference_dir=${train_output_path}/inference \ + --phones_dict=dump/phone_id_map.txt +fi diff --git a/examples/ljspeech/voc5/README.md b/examples/ljspeech/voc5/README.md new file mode 100644 index 000000000..9fbb9f746 --- /dev/null +++ b/examples/ljspeech/voc5/README.md @@ -0,0 +1,148 @@ +# HiFiGAN with the LJSpeech-1.1 +This example contains code used to train a [HiFiGAN](https://arxiv.org/abs/2010.05646) model with [LJSpeech-1.1](https://keithito.com/LJ-Speech-Dataset/). 
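Note on the stage guards introduced in the `synthesize.sh`/`synthesize_e2e.sh` diffs above: each vocoder call is now wrapped in a `stage`/`stop_stage` check, so only the blocks whose index falls inside that range are executed (in `synthesize_e2e.sh`, pwgan is guarded by stage 0 and hifigan by stage 1, and both variables default to 0). A minimal sketch of the pattern, with `echo` placeholders standing in for the real `synthesize.py` invocations shown in the diff:

```bash
#!/bin/bash
# Sketch of the stage-guard idiom used by the synthesize scripts above;
# the stage indices and echo bodies are placeholders, not the actual commands.
stage=0       # first block to run
stop_stage=0  # last block to run

if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo "stage 0: synthesize with the pwgan vocoder"
fi

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: synthesize with the hifigan vocoder"
fi
```

To switch vocoders, edit `stage`/`stop_stage` at the top of the local script (e.g. `stage=1`, `stop_stage=1` to run only the hifigan block).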
+## Dataset +### Download and Extract +Download LJSpeech-1.1 from the [official website](https://keithito.com/LJ-Speech-Dataset/). +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence in the edge of audio. +You can download from here [ljspeech_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/ljspeech_alignment.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. + +## Get Started +Assume the path to the dataset is `~/datasets/LJSpeech-1.1`. +Assume the path to the MFA result of LJSpeech-1.1 is `./ljspeech_alignment`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── feats_stats.npy +``` + +The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the norm folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set, which is located in `dump/train/feats_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains id and paths to the spectrogram of each utterance. + +### Model Training +`./local/train.sh` calls `${BIN_DIR}/train.py`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +Here's the complete help message. + +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] + [--run-benchmark RUN_BENCHMARK] + [--profiler_options PROFILER_OPTIONS] + +Train a ParallelWaveGAN model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG config file to overwrite default config. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + +benchmark: + arguments related to benchmark. + + --batch-size BATCH_SIZE + batch size. + --max-iter MAX_ITER train max steps. + --run-benchmark RUN_BENCHMARK + runing benchmark or not, if True, use the --batch-size + and --max-iter. + --profiler_options PROFILER_OPTIONS + The option of profiler, which should be in format + "key1=value1;key2=value2;key3=value3". +``` + +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. 
`--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +### Synthesizing +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] + [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] + [--output-dir OUTPUT_DIR] [--ngpu NGPU] + +Synthesize with GANVocoder. + +optional arguments: + -h, --help show this help message and exit + --generator-type GENERATOR_TYPE + type of GANVocoder, should in {pwgan, mb_melgan, + style_melgan, } now + --config CONFIG GANVocoder config file. + --checkpoint CHECKPOINT + snapshot to load. + --test-metadata TEST_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. +``` + +1. `--config` parallel wavegan config file. You should use the same config with which the model is trained. +2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. +3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. +4. `--output-dir` is the directory to save the synthesized audio files. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +## Pretrained Model +The pretrained model can be downloaded here [hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip). + + +Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss +:-------------:| :------------:| :-----: | :-----: | :--------: +default| 1(gpu) x 2500000|24.492|0.115|7.227 + +HiFiGAN checkpoint contains files listed below. + +```text +hifigan_ljspeech_ckpt_0.2.0 +├── default.yaml # default config used to train hifigan +├── feats_stats.npy # statistics used to normalize spectrogram when training hifigan +└── snapshot_iter_2500000.pdz # generator parameters of hifigan +``` + + +## Acknowledgement +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/ljspeech/voc5/conf/default.yaml b/examples/ljspeech/voc5/conf/default.yaml new file mode 100644 index 000000000..97c512204 --- /dev/null +++ b/examples/ljspeech/voc5/conf/default.yaml @@ -0,0 +1,167 @@ +# This is the configuration file for LJSpeech dataset. +# This configuration is based on HiFiGAN V1, which is an official configuration. +# But I found that the optimizer setting does not work well with my implementation. +# So I changed optimizer settings as follows: +# - AdamW -> Adam +# - betas: [0.8, 0.99] -> betas: [0.5, 0.9] +# - Scheduler: ExponentialLR -> MultiStepLR +# To match the shift size difference, the upsample scales is also modified from the original 256 shift setting. + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 22050 # Sampling rate. +n_fft: 1024 # FFT size (samples). +n_shift: 256 # Hop size (samples). 11.6ms +win_length: null # Window length (samples). + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. 
+fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 7600 # Maximum frequency in mel basis calculation. (Hz) + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 80 # Number of input channels. + out_channels: 1 # Number of output channels. + channels: 512 # Number of initial channels. + kernel_size: 7 # Kernel size of initial and final conv layers. + upsample_scales: [8, 8, 2, 2] # Upsampling scales. + upsample_kernel_sizes: [16, 16, 4, 4] # Kernel size for upsampling layers. + resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks. + resblock_dilations: # Dilations for residual blocks. + - [1, 3, 5] + - [1, 3, 5] + - [1, 3, 5] + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. + nonlinear_activation: "leakyrelu" # Nonlinear activation type. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + scales: 3 # Number of multi-scale discriminator. + scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator. + scale_downsample_pooling_params: + kernel_size: 4 # Pooling kernel size. + stride: 2 # Pooling stride. + padding: 2 # Padding size. + scale_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [15, 41, 5, 3] # List of kernel sizes. + channels: 128 # Initial number of channels. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + max_groups: 16 # Maximum number of groups in downsampling conv layers. + bias: True + downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: + negative_slope: 0.1 + follow_official_norm: True # Whether to follow the official norm setting. + periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. + period_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [5, 3] # List of kernel sizes. + channels: 32 # Initial number of channels. + downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + bias: True # Whether to use bias parameter in conv layer." + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. + + +########################################################### +# STFT LOSS SETTING # +########################################################### +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. 
+mel_loss_params: + fs: 22050 + fft_size: 1024 + hop_size: 256 + win_length: null + window: "hann" + num_mels: 80 + fmin: 0 + fmax: 11025 + log_base: null +generator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +discriminator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True +feat_match_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. + average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. + +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +lambda_aux: 45.0 # Loss balancing coefficient for STFT loss. +lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss. +lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.. + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 16 # Batch size. +batch_max_steps: 8192 # Length of each audio in batch. Make sure dividable by hop_size. +num_workers: 2 # Number of workers in DataLoader. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Generator's weight decay coefficient. +generator_scheduler_params: + learning_rate: 2.0e-4 # Generator's learning rate. + gamma: 0.5 # Generator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +generator_grad_norm: -1 # Generator's gradient norm. +discriminator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Discriminator's weight decay coefficient. +discriminator_scheduler_params: + learning_rate: 2.0e-4 # Discriminator's learning rate. + gamma: 0.5 # Discriminator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +discriminator_grad_norm: -1 # Discriminator's gradient norm. + +########################################################### +# INTERVAL SETTING # +########################################################### +generator_train_start_steps: 1 # Number of steps to start to train discriminator. +discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. +train_max_steps: 2500000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. 
+ +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh new file mode 100755 index 000000000..d1af60dad --- /dev/null +++ b/examples/ljspeech/voc5/local/preprocess.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./ljspeech_alignment \ + --output=durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/../preprocess.py \ + --rootdir=~/datasets/LJSpeech-1.1/ \ + --dataset=ljspeech \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --cut-sil=True \ + --num-cpu=20 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="feats" +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --stats=dump/train/feats_stats.npy + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --stats=dump/train/feats_stats.npy + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --stats=dump/train/feats_stats.npy +fi diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh new file mode 100755 index 000000000..647896175 --- /dev/null +++ b/examples/ljspeech/voc5/local/synthesize.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test \ + --generator-type=hifigan diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh new file mode 100755 index 000000000..9695631ef --- /dev/null +++ b/examples/ljspeech/voc5/local/train.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +FLAGS_cudnn_exhaustive_search=true \ +FLAGS_conv_workspace_size_limit=4000 \ +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh new file mode 100755 index 000000000..7451b3218 --- /dev/null +++ b/examples/ljspeech/voc5/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export 
PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=hifigan +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/ljspeech/voc5/run.sh b/examples/ljspeech/voc5/run.sh new file mode 100755 index 000000000..cab1ac38b --- /dev/null +++ b/examples/ljspeech/voc5/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0,1 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_5000.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/vctk/tts3/local/synthesize.sh b/examples/vctk/tts3/local/synthesize.sh index 8381af464..9e03f9b8a 100755 --- a/examples/vctk/tts3/local/synthesize.sh +++ b/examples/vctk/tts3/local/synthesize.sh @@ -4,18 +4,43 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --am=fastspeech2_vctk \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_vctk \ - --voc_config=pwg_vctk_ckpt_0.1.1/default.yaml \ - --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \ - --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt \ - --speaker_dict=dump/speaker_id_map.txt +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_vctk \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_vctk \ + --voc_config=pwg_vctk_ckpt_0.1.1/default.yaml \ + --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt +fi + +# hifigan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize.py \ + --am=fastspeech2_aishell3 \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_vctk \ + --voc_config=hifigan_vctk_ckpt_0.2.0/default.yaml \ + 
--voc_ckpt=hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_vctk_ckpt_0.2.0/feats_stats.npy \ + --test_metadata=dump/test/norm/metadata.jsonl \ + --output_dir=${train_output_path}/test \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt +fi diff --git a/examples/vctk/tts3/local/synthesize_e2e.sh b/examples/vctk/tts3/local/synthesize_e2e.sh index 60d56d1c9..a89f42b50 100755 --- a/examples/vctk/tts3/local/synthesize_e2e.sh +++ b/examples/vctk/tts3/local/synthesize_e2e.sh @@ -4,21 +4,49 @@ config_path=$1 train_output_path=$2 ckpt_name=$3 -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize_e2e.py \ - --am=fastspeech2_vctk \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_vctk \ - --voc_config=pwg_vctk_ckpt_0.1.1/default.yaml \ - --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \ - --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \ - --lang=en \ - --text=${BIN_DIR}/../sentences_en.txt \ - --output_dir=${train_output_path}/test_e2e \ - --phones_dict=dump/phone_id_map.txt \ - --speaker_dict=dump/speaker_id_map.txt \ - --spk_id=0 \ - --inference_dir=${train_output_path}/inference +stage=0 +stop_stage=0 + +# pwgan +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_vctk \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=pwgan_vctk \ + --voc_config=pwg_vctk_ckpt_0.1.1/default.yaml \ + --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \ + --lang=en \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --spk_id=0 \ + --inference_dir=${train_output_path}/inference +fi + +# hifigan +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + FLAGS_allocator_strategy=naive_best_fit \ + FLAGS_fraction_of_gpu_memory_to_use=0.01 \ + python3 ${BIN_DIR}/../synthesize_e2e.py \ + --am=fastspeech2_vctk \ + --am_config=${config_path} \ + --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ + --am_stat=dump/train/speech_stats.npy \ + --voc=hifigan_vctk \ + --voc_config=hifigan_vctk_ckpt_0.2.0/default.yaml \ + --voc_ckpt=hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \ + --voc_stat=hifigan_vctk_ckpt_0.2.0/feats_stats.npy \ + --lang=en \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output_dir=${train_output_path}/test_e2e \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --spk_id=0 \ + --inference_dir=${train_output_path}/inference +fi diff --git a/examples/vctk/voc5/README.md b/examples/vctk/voc5/README.md new file mode 100644 index 000000000..b4be341c0 --- /dev/null +++ b/examples/vctk/voc5/README.md @@ -0,0 +1,153 @@ +# HiFiGAN with VCTK +This example contains code used to train a [HiFiGAN](https://arxiv.org/abs/2010.05646) model with [VCTK](https://datashare.ed.ac.uk/handle/10283/3443). + +## Dataset +### Download and Extract +Download VCTK-0.92 from the [official website](https://datashare.ed.ac.uk/handle/10283/3443) and extract it to `~/datasets`. Then the dataset is in directory `~/datasets/VCTK-Corpus-0.92`. 
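A minimal extract sketch, assuming the corpus is downloaded as a single zip archive (the archive path below is a placeholder for whatever file the download page provides):

```bash
# Placeholder archive path -- substitute the file you actually downloaded from
# https://datashare.ed.ac.uk/handle/10283/3443.
mkdir -p ~/datasets
unzip ~/Downloads/VCTK-Corpus-0.92.zip -d ~/datasets/VCTK-Corpus-0.92
# Adjust -d if the archive already contains a VCTK-Corpus-0.92 top-level folder;
# the preprocess script expects --rootdir=~/datasets/VCTK-Corpus-0.92/.
```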
+ +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut the silence in the edge of audio. +You can download from here [vctk_alignment.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/VCTK-Corpus-0.92/vctk_alignment.tar.gz), or train your MFA model reference to [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) of our repo. +ps: we remove three speakers in VCTK-0.92 (see [reorganize_vctk.py](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/other/mfa/local/reorganize_vctk.py)): +1. `p315`, because of no text for it. +2. `p280` and `p362`, because no *_mic2.flac (which is better than *_mic1.flac) for them. + +## Get Started +Assume the path to the dataset is `~/datasets/VCTK-Corpus-0.92`. +Assume the path to the MFA result of VCTK is `./vctk_alignment`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done. A `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│ ├── norm +│ └── raw +├── test +│ ├── norm +│ └── raw +└── train + ├── norm + ├── raw + └── feats_stats.npy +``` + +The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the norm folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set, which is located in `dump/train/feats_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains id and paths to the spectrogram of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. + +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] + [--run-benchmark RUN_BENCHMARK] + [--profiler_options PROFILER_OPTIONS] + +Train a ParallelWaveGAN model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG config file to overwrite default config. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + +benchmark: + arguments related to benchmark. + + --batch-size BATCH_SIZE + batch size. + --max-iter MAX_ITER train max steps. + --run-benchmark RUN_BENCHMARK + runing benchmark or not, if True, use the --batch-size + and --max-iter. + --profiler_options PROFILER_OPTIONS + The option of profiler, which should be in format + "key1=value1;key2=value2;key3=value3". +``` + +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. 
`--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +### Synthesizing +`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG] + [--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA] + [--output-dir OUTPUT_DIR] [--ngpu NGPU] + +Synthesize with GANVocoder. + +optional arguments: + -h, --help show this help message and exit + --generator-type GENERATOR_TYPE + type of GANVocoder, should in {pwgan, mb_melgan, + style_melgan, } now + --config CONFIG GANVocoder config file. + --checkpoint CHECKPOINT + snapshot to load. + --test-metadata TEST_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. +``` + + +1. `--config` config file. You should use the same config with which the model is trained. +2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory. +3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory. +4. `--output-dir` is the directory to save the synthesized audio files. +5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +## Pretrained Model +The pretrained model can be downloaded here [hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip). + + +Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss +:-------------:| :------------:| :-----: | :-----: | :--------: +default| 1(gpu) x 2500000|58.092|0.1234|24.384 + +HiFiGAN checkpoint contains files listed below. + +```text +hifigan_vctk_ckpt_0.2.0 +├── default.yaml # default config used to train hifigan +├── feats_stats.npy # statistics used to normalize spectrogram when training hifigan +└── snapshot_iter_2500000.pdz # generator parameters of hifigan +``` + +## Acknowledgement +We adapted some code from https://github.com/kan-bayashi/ParallelWaveGAN. diff --git a/examples/vctk/voc5/conf/default.yaml b/examples/vctk/voc5/conf/default.yaml new file mode 100644 index 000000000..6361e01b2 --- /dev/null +++ b/examples/vctk/voc5/conf/default.yaml @@ -0,0 +1,168 @@ +# This is the configuration file for VCTK dataset. +# This configuration is based on HiFiGAN V1, which is +# an official configuration. But I found that the optimizer +# setting does not work well with my implementation. +# So I changed optimizer settings as follows: +# - AdamW -> Adam +# - betas: [0.8, 0.99] -> betas: [0.5, 0.9] +# - Scheduler: ExponentialLR -> MultiStepLR +# To match the shift size difference, the upsample scales +# is also modified from the original 256 shift setting. +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +fs: 24000 # Sampling rate. +n_fft: 2048 # FFT size (samples). +n_shift: 300 # Hop size (samples). 12.5ms +win_length: 1200 # Window length (samples). 
50ms + # If set to null, it will be the same as fft_size. +window: "hann" # Window function. +n_mels: 80 # Number of mel basis. +fmin: 80 # Minimum freq in mel basis calculation. (Hz) +fmax: 7600 # Maximum frequency in mel basis calculation. (Hz) + +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### +generator_params: + in_channels: 80 # Number of input channels. + out_channels: 1 # Number of output channels. + channels: 512 # Number of initial channels. + kernel_size: 7 # Kernel size of initial and final conv layers. + upsample_scales: [5, 5, 4, 3] # Upsampling scales. + upsample_kernel_sizes: [10, 10, 8, 6] # Kernel size for upsampling layers. + resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks. + resblock_dilations: # Dilations for residual blocks. + - [1, 3, 5] + - [1, 3, 5] + - [1, 3, 5] + use_additional_convs: True # Whether to use additional conv layer in residual blocks. + bias: True # Whether to use bias parameter in conv. + nonlinear_activation: "leakyrelu" # Nonlinear activation type. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### +discriminator_params: + scales: 3 # Number of multi-scale discriminator. + scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator. + scale_downsample_pooling_params: + kernel_size: 4 # Pooling kernel size. + stride: 2 # Pooling stride. + padding: 2 # Padding size. + scale_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [15, 41, 5, 3] # List of kernel sizes. + channels: 128 # Initial number of channels. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + max_groups: 16 # Maximum number of groups in downsampling conv layers. + bias: True + downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales. + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: + negative_slope: 0.1 + follow_official_norm: True # Whether to follow the official norm setting. + periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator. + period_discriminator_params: + in_channels: 1 # Number of input channels. + out_channels: 1 # Number of output channels. + kernel_sizes: [5, 3] # List of kernel sizes. + channels: 32 # Initial number of channels. + downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales. + max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers. + bias: True # Whether to use bias parameter in conv layer." + nonlinear_activation: "leakyrelu" # Nonlinear activation. + nonlinear_activation_params: # Nonlinear activation paramters. + negative_slope: 0.1 + use_weight_norm: True # Whether to apply weight normalization. + use_spectral_norm: False # Whether to apply spectral normalization. + + +########################################################### +# STFT LOSS SETTING # +########################################################### +use_stft_loss: False # Whether to use multi-resolution STFT loss. +use_mel_loss: True # Whether to use Mel-spectrogram loss. 
+mel_loss_params: + fs: 24000 + fft_size: 2048 + hop_size: 300 + win_length: 1200 + window: "hann" + num_mels: 80 + fmin: 0 + fmax: 12000 + log_base: null +generator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +discriminator_adv_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. +use_feat_match_loss: True +feat_match_loss_params: + average_by_discriminators: False # Whether to average loss by #discriminators. + average_by_layers: False # Whether to average loss by #layers in each discriminator. + include_final_outputs: False # Whether to include final outputs in feat match loss calculation. + +########################################################### +# ADVERSARIAL LOSS SETTING # +########################################################### +lambda_aux: 45.0 # Loss balancing coefficient for STFT loss. +lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss. +lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.. + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 16 # Batch size. +batch_max_steps: 8400 # Length of each audio in batch. Make sure dividable by hop_size. +num_workers: 2 # Number of workers in DataLoader. + +########################################################### +# OPTIMIZER & SCHEDULER SETTING # +########################################################### +generator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Generator's weight decay coefficient. +generator_scheduler_params: + learning_rate: 2.0e-4 # Generator's learning rate. + gamma: 0.5 # Generator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +generator_grad_norm: -1 # Generator's gradient norm. +discriminator_optimizer_params: + beta1: 0.5 + beta2: 0.9 + weight_decay: 0.0 # Discriminator's weight decay coefficient. +discriminator_scheduler_params: + learning_rate: 2.0e-4 # Discriminator's learning rate. + gamma: 0.5 # Discriminator's scheduler gamma. + milestones: # At each milestone, lr will be multiplied by gamma. + - 200000 + - 400000 + - 600000 + - 800000 +discriminator_grad_norm: -1 # Discriminator's gradient norm. + +########################################################### +# INTERVAL SETTING # +########################################################### +generator_train_start_steps: 1 # Number of steps to start to train discriminator. +discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. +train_max_steps: 2500000 # Number of training steps. +save_interval_steps: 5000 # Interval steps to save checkpoint. +eval_interval_steps: 1000 # Interval steps to evaluate the network. 
+ +########################################################### +# OTHER SETTING # +########################################################### +num_snapshots: 10 # max number of snapshots to keep while training +seed: 42 # random seed for paddle, random, and np.random diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh new file mode 100755 index 000000000..88a478cd5 --- /dev/null +++ b/examples/vctk/voc5/local/preprocess.sh @@ -0,0 +1,55 @@ +#!/bin/bash + +stage=0 +stop_stage=100 + +config_path=$1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # get durations from MFA's result + echo "Generate durations.txt from MFA results ..." + python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ + --inputdir=./vctk_alignment \ + --output=durations.txt \ + --config=${config_path} +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # extract features + echo "Extract features ..." + python3 ${BIN_DIR}/../preprocess.py \ + --rootdir=~/datasets/VCTK-Corpus-0.92/ \ + --dataset=vctk \ + --dumpdir=dump \ + --dur-file=durations.txt \ + --config=${config_path} \ + --cut-sil=True \ + --num-cpu=20 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # get features' stats(mean and std) + echo "Get features' stats ..." + python3 ${MAIN_ROOT}/utils/compute_statistics.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --field-name="feats" +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + # normalize, dev and test should use train's stats + echo "Normalize ..." + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/train/raw/metadata.jsonl \ + --dumpdir=dump/train/norm \ + --stats=dump/train/feats_stats.npy + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/dev/raw/metadata.jsonl \ + --dumpdir=dump/dev/norm \ + --stats=dump/train/feats_stats.npy + + python3 ${BIN_DIR}/../normalize.py \ + --metadata=dump/test/raw/metadata.jsonl \ + --dumpdir=dump/test/norm \ + --stats=dump/train/feats_stats.npy +fi diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh new file mode 100755 index 000000000..647896175 --- /dev/null +++ b/examples/vctk/voc5/local/synthesize.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 +ckpt_name=$3 + +FLAGS_allocator_strategy=naive_best_fit \ +FLAGS_fraction_of_gpu_memory_to_use=0.01 \ +python3 ${BIN_DIR}/../synthesize.py \ + --config=${config_path} \ + --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ + --test-metadata=dump/test/norm/metadata.jsonl \ + --output-dir=${train_output_path}/test \ + --generator-type=hifigan diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh new file mode 100755 index 000000000..9695631ef --- /dev/null +++ b/examples/vctk/voc5/local/train.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +config_path=$1 +train_output_path=$2 + +FLAGS_cudnn_exhaustive_search=true \ +FLAGS_conv_workspace_size_limit=4000 \ +python ${BIN_DIR}/train.py \ + --train-metadata=dump/train/norm/metadata.jsonl \ + --dev-metadata=dump/dev/norm/metadata.jsonl \ + --config=${config_path} \ + --output-dir=${train_output_path} \ + --ngpu=1 diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh new file mode 100755 index 000000000..7451b3218 --- /dev/null +++ b/examples/vctk/voc5/path.sh @@ -0,0 +1,13 @@ +#!/bin/bash +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid 
UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +MODEL=hifigan +export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/vctk/voc5/run.sh b/examples/vctk/voc5/run.sh new file mode 100755 index 000000000..4f426ea02 --- /dev/null +++ b/examples/vctk/voc5/run.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +set -e +source path.sh + +gpus=0 +stage=0 +stop_stage=100 + +conf_path=conf/default.yaml +train_output_path=exp/default +ckpt_name=snapshot_iter_5000.pdz + +# with the following command, you can choose the stage range you want to run +# such as `./run.sh --stage 0 --stop-stage 0` +# this can not be mixed use with `$1`, `$2` ... +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # prepare data + ./local/preprocess.sh ${conf_path} || exit -1 +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train model, all `ckpt` under `train_output_path/checkpoints/` dir + CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1 +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # synthesize + CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1 +fi diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md index 2c8ad1386..42f8903e4 100644 --- a/examples/voxceleb/README.md +++ b/examples/voxceleb/README.md @@ -6,3 +6,45 @@ sv0 - speaker verfication with softmax backend etc, all python code sv1 - dependence on kaldi, speaker verfication with plda/sc backend, more info refer to the sv1/readme.txt + + +## VoxCeleb2 preparation + +VoxCeleb2 audio files are released in m4a format. All the VoxCeleb2 m4a audio files must be converted in wav files before feeding them in PaddleSpeech. +Please, follow these steps to prepare the dataset correctly: + +1. Download Voxceleb2. +You can find download instructions here: http://www.robots.ox.ac.uk/~vgg/data/voxceleb/ + +2. Convert .m4a to wav +VoxCeleb2 stores files with the m4a audio format. To use them in PaddleSpeech, you have to convert all the m4a audio files into wav files. + +``` shell +ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s +``` + +You can do the conversion using ffmpeg https://gist.github.com/seungwonpark/4f273739beef2691cd53b5c39629d830). This operation might take several hours and should be only once. + +3. Put all the wav files in a folder called `wav`. 
You should have something like `voxceleb2/wav/id*/*.wav` (e.g, `voxceleb2/wav/id00012/21Uxsk56VDQ/00001.wav`) + + +## voxceleb dataset summary + + +|dataset | vox1 - dev | vox1 - test |vox2 - dev| vox2 - test| +|---------|-----------|------------|-----------|----------| +|spks | 1211 |40 | 5994 | 118| +|utts | 148642 | 4874 | 1092009 |36273| +| time(h) | 340.4 | 11.2 | 2360.2 |79.9 | + + +## trial summary + +| trial | filename | nums | positive | negative | +|--------|-----------|--------|-------|------| +| VoxCeleb1 | veri_test.txt | 37720 | 18860 | 18860 | +| VoxCeleb1(cleaned) | veri_test2.txt | 37611 | 18802 | 18809 | +| VoxCeleb1-H | list_test_hard.txt | 552536 | 276270 | 276266 | +|VoxCeleb1-H(cleaned) |list_test_hard2.txt | 550894 | 275488 | 275406 | +|VoxCeleb1-E | list_test_all.txt | 581480 | 290743 | 290737 | +|VoxCeleb1-E(cleaned) | list_test_all2.txt |579818 |289921 |289897 | diff --git a/examples/voxceleb/sv0/RESULT.md b/examples/voxceleb/sv0/RESULT.md new file mode 100644 index 000000000..c37bcecef --- /dev/null +++ b/examples/voxceleb/sv0/RESULT.md @@ -0,0 +1,7 @@ +# VoxCeleb + +## ECAPA-TDNN + +| Model | Number of Params | Release | Config | dim | Test set | Cosine | Cosine + S-Norm | +| --- | --- | --- | --- | --- | --- | --- | ---- | +| ECAPA-TDNN | 85M | 0.1.1 | conf/ecapa_tdnn.yaml |192 | test | 1.15 | 1.06 | diff --git a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml new file mode 100644 index 000000000..e58dca82d --- /dev/null +++ b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml @@ -0,0 +1,52 @@ +########################################### +# Data # +########################################### +# we should explicitly specify the wav path of vox2 audio data converted from m4a +vox2_base_path: +augment: True +batch_size: 16 +num_workers: 2 +num_speakers: 7205 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41 +shuffle: True +random_chunk: True + +########################################################### +# FEATURE EXTRACTION SETTING # +########################################################### +# currently, we only support fbank +sr: 16000 # sample rate +n_mels: 80 +window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400 +hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160 + +########################################################### +# MODEL SETTING # +########################################################### +# currently, we only support ecapa-tdnn in the ecapa_tdnn.yaml +# if we want use another model, please choose another configuration yaml file +model: + input_size: 80 + # "channels": [512, 512, 512, 512, 1536], + channels: [1024, 1024, 1024, 1024, 3072] + kernel_sizes: [5, 3, 3, 3, 1] + dilations: [1, 2, 3, 4, 1] + attention_channels: 128 + lin_neurons: 192 + +########################################### +# Training # +########################################### +seed: 1986 # according from speechbrain configuration +epochs: 10 +save_interval: 1 +log_interval: 1 +learning_rate: 1e-8 + + +########################################### +# Testing # +########################################### +global_embedding_norm: True +embedding_mean_norm: True +embedding_std_norm: False + diff --git a/examples/voxceleb/sv0/local/data.sh b/examples/voxceleb/sv0/local/data.sh new file mode 100755 index 000000000..a3ff1c486 --- /dev/null +++ b/examples/voxceleb/sv0/local/data.sh @@ -0,0 +1,58 @@ +#!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +stage=1 +stop_stage=100 + +. ${MAIN_ROOT}/utils/parse_options.sh || exit -1; + +if [ $# -ne 2 ] ; then + echo "Usage: $0 [options] "; + echo "e.g.: $0 ./data/ conf/ecapa_tdnn.yaml" + echo "Options: " + echo " --stage # Used to run a partially-completed data process from somewhere in the middle." + echo " --stop-stage # Used to run a partially-completed data process stop stage in the middle" + exit 1; +fi + +dir=$1 +conf_path=$2 +mkdir -p ${dir} + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # data prepare for vox1 and vox2, vox2 must be converted from m4a to wav + # we should use the local/convert.sh convert m4a to wav + python3 local/data_prepare.py \ + --data-dir ${dir} \ + --config ${conf_path} +fi + +TARGET_DIR=${MAIN_ROOT}/dataset +mkdir -p ${TARGET_DIR} + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # download data, generate manifests + python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \ + --manifest_prefix="data/vox1/manifest" \ + --target_dir="${TARGET_DIR}/voxceleb/vox1/" + + if [ $? -ne 0 ]; then + echo "Prepare voxceleb failed. Terminated." + exit 1 + fi + + # for dataset in train dev test; do + # mv data/manifest.${dataset} data/manifest.${dataset}.raw + # done +fi \ No newline at end of file diff --git a/examples/voxceleb/sv0/local/data_prepare.py b/examples/voxceleb/sv0/local/data_prepare.py new file mode 100644 index 000000000..03d054004 --- /dev/null +++ b/examples/voxceleb/sv0/local/data_prepare.py @@ -0,0 +1,70 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse + +import paddle +from yacs.config import CfgNode + +from paddleaudio.datasets.voxceleb import VoxCeleb +from paddlespeech.s2t.utils.log import Log +from paddlespeech.vector.io.augment import build_augment_pipeline +from paddlespeech.vector.training.seeding import seed_everything + +logger = Log(__name__).getlog() + + +def main(args, config): + + # stage0: set the cpu device, all data prepare process will be done in cpu mode + paddle.set_device("cpu") + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + # stage 1: generate the voxceleb csv file + # Note: this may occurs c++ execption, but the program will execute fine + # so we ignore the execption + # we explicitly pass the vox2 base path to data prepare and generate the audio info + logger.info("start to generate the voxceleb dataset info") + train_dataset = VoxCeleb( + 'train', target_dir=args.data_dir, vox2_base_path=config.vox2_base_path) + + # stage 2: generate the augment noise csv file + if config.augment: + logger.info("start to generate the augment dataset info") + augment_pipeline = build_augment_pipeline(target_dir=args.data_dir) + + +if __name__ == "__main__": + # yapf: disable + parser = argparse.ArgumentParser(__doc__) + parser.add_argument("--data-dir", + default="./data/", + type=str, + help="data directory") + parser.add_argument("--config", + default=None, + type=str, + help="configuration file") + args = parser.parse_args() + # yapf: enable + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + + config.freeze() + print(config) + + main(args, config) diff --git a/examples/voxceleb/sv0/local/emb.sh b/examples/voxceleb/sv0/local/emb.sh new file mode 100755 index 000000000..31d79e52d --- /dev/null +++ b/examples/voxceleb/sv0/local/emb.sh @@ -0,0 +1,51 @@ +#!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +. ./path.sh + +stage=0 +stop_stage=100 +exp_dir=exp/ecapa-tdnn-vox12-big/ # experiment directory +conf_path=conf/ecapa_tdnn.yaml +audio_path="demo/voxceleb/00001.wav" +use_gpu=true + +. ${MAIN_ROOT}/utils/parse_options.sh || exit -1; + +if [ $# -ne 0 ] ; then + echo "Usage: $0 [options]"; + echo "e.g.: $0 ./data/ exp/voxceleb12/ conf/ecapa_tdnn.yaml" + echo "Options: " + echo " --use-gpu # specify is gpu is to be used for training" + echo " --stage # Used to run a partially-completed data process from somewhere in the middle." 
+ echo " --stop-stage # Used to run a partially-completed data process stop stage in the middle" + echo " --exp-dir # experiment directorh, where is has the model.pdparams" + echo " --conf-path # configuration file for extracting the embedding" + echo " --audio-path # audio-path, which will be processed to extract the embedding" + exit 1; +fi + +# set the test device +device="cpu" +if ${use_gpu}; then + device="gpu" +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # extract the audio embedding + python3 ${BIN_DIR}/extract_emb.py --device ${device} \ + --config ${conf_path} \ + --audio-path ${audio_path} --load-checkpoint ${exp_dir} +fi \ No newline at end of file diff --git a/examples/voxceleb/sv0/local/test.sh b/examples/voxceleb/sv0/local/test.sh new file mode 100644 index 000000000..4460a165a --- /dev/null +++ b/examples/voxceleb/sv0/local/test.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +stage=1 +stop_stage=100 +use_gpu=true # if true, we run on GPU. + +. ${MAIN_ROOT}/utils/parse_options.sh || exit -1; + +if [ $# -ne 3 ] ; then + echo "Usage: $0 [options] "; + echo "e.g.: $0 ./data/ exp/voxceleb12/ conf/ecapa_tdnn.yaml" + echo "Options: " + echo " --use-gpu # specify is gpu is to be used for training" + echo " --stage # Used to run a partially-completed data process from somewhere in the middle." + echo " --stop-stage # Used to run a partially-completed data process stop stage in the middle" + exit 1; +fi + +dir=$1 +exp_dir=$2 +conf_path=$3 + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # test the model and compute the eer metrics + python3 ${BIN_DIR}/test.py \ + --data-dir ${dir} \ + --load-checkpoint ${exp_dir} \ + --config ${conf_path} +fi diff --git a/examples/voxceleb/sv0/local/train.sh b/examples/voxceleb/sv0/local/train.sh new file mode 100755 index 000000000..5477d0a34 --- /dev/null +++ b/examples/voxceleb/sv0/local/train.sh @@ -0,0 +1,61 @@ +#!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +stage=0 +stop_stage=100 +use_gpu=true # if true, we run on GPU. + +. 
${MAIN_ROOT}/utils/parse_options.sh || exit -1; + +if [ $# -ne 3 ] ; then + echo "Usage: $0 [options] "; + echo "e.g.: $0 ./data/ exp/voxceleb12/ conf/ecapa_tdnn.yaml" + echo "Options: " + echo " --use-gpu # specify is gpu is to be used for training" + echo " --stage # Used to run a partially-completed data process from somewhere in the middle." + echo " --stop-stage # Used to run a partially-completed data process stop stage in the middle" + exit 1; +fi + +dir=$1 +exp_dir=$2 +conf_path=$3 + +# get the gpu nums for training +ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +echo "using $ngpu gpus..." + +# setting training device +device="cpu" +if ${use_gpu}; then + device="gpu" +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # train the speaker identification task with voxceleb data + # and we will create the trained model parameters in ${exp_dir}/model.pdparams as the soft link + # Note: we will store the log file in exp/log directory + python3 -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES \ + ${BIN_DIR}/train.py --device ${device} --checkpoint-dir ${exp_dir} \ + --data-dir ${dir} --config ${conf_path} + +fi + +if [ $? -ne 0 ]; then + echo "Failed in training!" + exit 1 +fi + +exit 0 \ No newline at end of file diff --git a/examples/voxceleb/sv0/path.sh b/examples/voxceleb/sv0/path.sh new file mode 100755 index 000000000..2be098e04 --- /dev/null +++ b/examples/voxceleb/sv0/path.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +export MAIN_ROOT=`realpath ${PWD}/../../../` + +export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} +export LC_ALL=C + +export PYTHONDONTWRITEBYTECODE=1 +# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} + +export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/ + +MODEL=ecapa_tdnn +export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL} \ No newline at end of file diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh new file mode 100755 index 000000000..bbc9e3dbb --- /dev/null +++ b/examples/voxceleb/sv0/run.sh @@ -0,0 +1,69 @@ +#!/bin/bash +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +. 
./path.sh +set -e + +####################################################################### +# stage 0: data prepare, including voxceleb1 download and generate {train,dev,enroll,test}.csv +# voxceleb2 data is m4a format, so we need user to convert the m4a to wav yourselves as described in Readme.md with the script local/convert.sh +# stage 1: train the speaker identification model +# stage 2: test speaker identification +# stage 3: extract the training embeding to train the LDA and PLDA +###################################################################### + +# we can set the variable PPAUDIO_HOME to specifiy the root directory of the downloaded vox1 and vox2 dataset +# default the dataset will be stored in the ~/.paddleaudio/ +# the vox2 dataset is stored in m4a format, we need to convert the audio from m4a to wav yourself +# and put all of them to ${PPAUDIO_HOME}/datasets/vox2 +# we will find the wav from ${PPAUDIO_HOME}/datasets/vox1/wav and ${PPAUDIO_HOME}/datasets/vox2/wav +# export PPAUDIO_HOME= +stage=0 +stop_stage=50 + +# data directory +# if we set the variable ${dir}, we will store the wav info to this directory +# otherwise, we will store the wav info to vox1 and vox2 directory respectively +# vox2 wav path, we must convert the m4a format to wav format +dir=data/ # data info directory + +exp_dir=exp/ecapa-tdnn-vox12-big/ # experiment directory +conf_path=conf/ecapa_tdnn.yaml +gpus=0,1,2,3 + +source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; + +mkdir -p ${exp_dir} + +if [ $stage -le 0 ] && [ ${stop_stage} -ge 0 ]; then + # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav + bash ./local/data.sh ${dir} ${conf_path}|| exit -1; +fi + +if [ $stage -le 1 ] && [ ${stop_stage} -ge 1 ]; then + # stage 1: train the speaker identification model + CUDA_VISIBLE_DEVICES=${gpus} bash ./local/train.sh ${dir} ${exp_dir} ${conf_path} +fi + +if [ $stage -le 2 ] && [ ${stop_stage} -ge 2 ]; then + # stage 2: get the speaker verification scores with cosine function + # now we only support use cosine to get the scores + CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh ${dir} ${exp_dir} ${conf_path} +fi + +# if [ $stage -le 3 ]; then +# # stage 2: extract the training embeding to train the LDA and PLDA +# # todo: extract the training embedding +# fi diff --git a/examples/voxceleb/sv0/utils b/examples/voxceleb/sv0/utils new file mode 120000 index 000000000..256f914ab --- /dev/null +++ b/examples/voxceleb/sv0/utils @@ -0,0 +1 @@ +../../../utils/ \ No newline at end of file diff --git a/paddleaudio/.gitignore b/paddleaudio/.gitignore new file mode 100644 index 000000000..1c930053d --- /dev/null +++ b/paddleaudio/.gitignore @@ -0,0 +1,2 @@ +.eggs +*.wav diff --git a/paddleaudio/CHANGELOG.md b/paddleaudio/CHANGELOG.md index 91b0fef08..925d77696 100644 --- a/paddleaudio/CHANGELOG.md +++ b/paddleaudio/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +Date: 2022-3-15, Author: Xiaojie Chen. + - kaldi and librosa mfcc, fbank, spectrogram. + - unit test and benchmark. + Date: 2022-2-25, Author: Hui Zhang. - Refactor architecture. - - dtw distance and mcd style dtw + - dtw distance and mcd style dtw. diff --git a/paddleaudio/README.md b/paddleaudio/README.md new file mode 100644 index 000000000..697c01739 --- /dev/null +++ b/paddleaudio/README.md @@ -0,0 +1,7 @@ +# PaddleAudio + +PaddleAudio is an audio library for PaddlePaddle. 
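
As a quick orientation for readers of this PR, the sketch below shows how the backend and compliance modules added here fit together once the package is installed (see the Install section that follows). The wav path is a placeholder, and the keyword values simply restate the defaults documented later in this diff.

```python
# Minimal sketch, assuming paddleaudio from this PR is installed and a local
# file named `input.wav` exists (placeholder path).
from paddleaudio.backends import load
from paddleaudio.compliance.librosa import melspectrogram

# Load the waveform as 1-D float32 mono at its native sample rate.
waveform, sr = load("input.wav", mono=True, dtype="float32")

# 64-bin log-mel spectrogram; parameter values follow the docstrings in this PR.
mel = melspectrogram(waveform, sr=sr, window_size=512, hop_length=320, n_mels=64)
print(mel.shape)  # (n_mels, num_frames)
```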
+ +## Install + +`pip install .` diff --git a/paddleaudio/docs/Makefile b/paddleaudio/docs/Makefile new file mode 100644 index 000000000..69fe55ecf --- /dev/null +++ b/paddleaudio/docs/Makefile @@ -0,0 +1,19 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/paddleaudio/docs/README.md b/paddleaudio/docs/README.md new file mode 100644 index 000000000..20626f52b --- /dev/null +++ b/paddleaudio/docs/README.md @@ -0,0 +1,24 @@ +# Build docs for PaddleAudio + +Execute the following steps in **current directory**. + +## 1. Install + +`pip install Sphinx sphinx_rtd_theme` + + +## 2. Generate API docs + +Generate API docs from doc string. + +`sphinx-apidoc -fMeT -o source ../paddleaudio ../paddleaudio/utils --templatedir source/_templates` + + +## 3. Build + +`sphinx-build source _html` + + +## 4. Preview + +Open `_html/index.html` for page preview. diff --git a/paddleaudio/docs/images/paddle.png b/paddleaudio/docs/images/paddle.png new file mode 100644 index 000000000..bc1135abf Binary files /dev/null and b/paddleaudio/docs/images/paddle.png differ diff --git a/paddleaudio/docs/make.bat b/paddleaudio/docs/make.bat new file mode 100644 index 000000000..543c6b13b --- /dev/null +++ b/paddleaudio/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=source +set BUILDDIR=build + +if "%1" == "" goto help + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. + echo.If you don't have Sphinx installed, grab it from + echo.http://sphinx-doc.org/ + exit /b 1 +) + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% + +:end +popd diff --git a/paddleaudio/docs/source/_static/custom.css b/paddleaudio/docs/source/_static/custom.css new file mode 100644 index 000000000..bb65c51a9 --- /dev/null +++ b/paddleaudio/docs/source/_static/custom.css @@ -0,0 +1,5 @@ +.wy-nav-content { + max-width: 80%; +} +.table table{ background:#b9b9b9} +.table table td{ background:#FFF; } diff --git a/paddleaudio/docs/source/_templates/module.rst_t b/paddleaudio/docs/source/_templates/module.rst_t new file mode 100644 index 000000000..d9a50e6b9 --- /dev/null +++ b/paddleaudio/docs/source/_templates/module.rst_t @@ -0,0 +1,9 @@ +{%- if show_headings %} +{{- basename | e | heading }} + +{% endif -%} +.. 
automodule:: {{ qualname }} +{%- for option in automodule_options %} + :{{ option }}: +{%- endfor %} + diff --git a/paddleaudio/docs/source/_templates/package.rst_t b/paddleaudio/docs/source/_templates/package.rst_t new file mode 100644 index 000000000..7239c11b7 --- /dev/null +++ b/paddleaudio/docs/source/_templates/package.rst_t @@ -0,0 +1,57 @@ +{%- macro automodule(modname, options) -%} +.. automodule:: {{ modname }} +{%- for option in options %} + :{{ option }}: +{%- endfor %} +{%- endmacro %} + +{%- macro toctree(docnames) -%} +.. toctree:: + :maxdepth: {{ maxdepth }} +{% for docname in docnames %} + {{ docname }} +{%- endfor %} +{%- endmacro %} + +{%- if is_namespace %} +{{- [pkgname, "namespace"] | join(" ") | e | heading }} +{% else %} +{{- pkgname | e | heading }} +{% endif %} + +{%- if is_namespace %} +.. py:module:: {{ pkgname }} +{% endif %} + +{%- if modulefirst and not is_namespace %} +{{ automodule(pkgname, automodule_options) }} +{% endif %} + +{%- if subpackages %} +Subpackages +----------- + +{{ toctree(subpackages) }} +{% endif %} + +{%- if submodules %} +Submodules +---------- +{% if separatemodules %} +{{ toctree(submodules) }} +{% else %} +{%- for submodule in submodules %} +{% if show_headings %} +{{- submodule | e | heading(2) }} +{% endif %} +{{ automodule(submodule, automodule_options) }} +{% endfor %} +{%- endif %} +{%- endif %} + +{%- if not modulefirst and not is_namespace %} +Module contents +--------------- + +{{ automodule(pkgname, automodule_options) }} +{% endif %} diff --git a/paddleaudio/docs/source/_templates/toc.rst_t b/paddleaudio/docs/source/_templates/toc.rst_t new file mode 100644 index 000000000..f0877eeb2 --- /dev/null +++ b/paddleaudio/docs/source/_templates/toc.rst_t @@ -0,0 +1,8 @@ +{{ header | heading }} + +.. toctree:: + :maxdepth: {{ maxdepth }} +{% for docname in docnames %} + {{ docname }} +{%- endfor %} + diff --git a/paddleaudio/docs/source/conf.py b/paddleaudio/docs/source/conf.py new file mode 100644 index 000000000..09c4f312f --- /dev/null +++ b/paddleaudio/docs/source/conf.py @@ -0,0 +1,181 @@ +# -*- coding: utf-8 -*- +# +# Configuration file for the Sphinx documentation builder. +# +# This file does only contain a selection of the most common options. For a +# full list see the documentation: +# http://www.sphinx-doc.org/en/master/config +# -- Path setup -------------------------------------------------------------- +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +import os +import sys +sys.path.insert(0, os.path.abspath('../..')) + +# -- Project information ----------------------------------------------------- + +project = 'PaddleAudio' +copyright = '2022, PaddlePaddle' +author = 'PaddlePaddle' + +# The short X.Y version +version = '' +# The full version, including alpha/beta/rc tags +release = '0.2.0' + +# -- General configuration --------------------------------------------------- + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. 
+extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.intersphinx', + 'sphinx.ext.mathjax', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', +] + +napoleon_google_docstring = True + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +source_suffix = '.rst' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = [] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = None + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# + +import sphinx_rtd_theme +html_theme = 'sphinx_rtd_theme' +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] +smartquotes = False + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] +html_logo = '../images/paddle.png' +html_css_files = [ + 'custom.css', +] + +# Custom sidebar templates, must be a dictionary that maps document names +# to template names. +# +# The default sidebars (for documents that don't match any pattern) are +# defined by theme itself. Builtin themes are using these templates by +# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', +# 'searchbox.html']``. +# +# html_sidebars = {} + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'PaddleAudiodoc' + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'PaddleAudio.tex', 'PaddleAudio Documentation', 'PaddlePaddle', + 'manual'), +] + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). 
+man_pages = [(master_doc, 'paddleaudio', 'PaddleAudio Documentation', [author], + 1)] + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'PaddleAudio', 'PaddleAudio Documentation', author, + 'PaddleAudio', 'One line description of project.', 'Miscellaneous'), +] + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ['search.html'] + +# -- Extension configuration ------------------------------------------------- + +# -- Options for intersphinx extension --------------------------------------- + +# Example configuration for intersphinx: refer to the Python standard library. +intersphinx_mapping = {'https://docs.python.org/': None} diff --git a/paddleaudio/docs/source/index.rst b/paddleaudio/docs/source/index.rst new file mode 100644 index 000000000..26963308e --- /dev/null +++ b/paddleaudio/docs/source/index.rst @@ -0,0 +1,22 @@ +.. PaddleAudio documentation master file, created by + sphinx-quickstart on Tue Mar 22 15:57:16 2022. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to PaddleAudio's documentation! +======================================= + +.. toctree:: + :maxdepth: 1 + + Index + + +API References +-------------- + +.. toctree:: + :maxdepth: 2 + :titlesonly: + + paddleaudio \ No newline at end of file diff --git a/paddleaudio/paddleaudio/backends/soundfile_backend.py b/paddleaudio/paddleaudio/backends/soundfile_backend.py index 2b920284a..c1155654f 100644 --- a/paddleaudio/paddleaudio/backends/soundfile_backend.py +++ b/paddleaudio/paddleaudio/backends/soundfile_backend.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import os import warnings from typing import Optional from typing import Tuple @@ -19,7 +20,6 @@ from typing import Union import numpy as np import resampy import soundfile as sf -from numpy import ndarray as array from scipy.io import wavfile from ..utils import ParameterError @@ -38,13 +38,21 @@ RESAMPLE_MODES = ['kaiser_best', 'kaiser_fast'] EPS = 1e-8 -def resample(y: array, src_sr: int, target_sr: int, - mode: str='kaiser_fast') -> array: - """ Audio resampling - This function is the same as using resampy.resample(). - Notes: - The default mode is kaiser_fast. For better audio quality, use mode = 'kaiser_fast' - """ +def resample(y: np.ndarray, + src_sr: int, + target_sr: int, + mode: str='kaiser_fast') -> np.ndarray: + """Audio resampling. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + src_sr (int): Source sample rate. + target_sr (int): Target sample rate. + mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'. 
+ + Returns: + np.ndarray: `y` resampled to `target_sr` + """ if mode == 'kaiser_best': warnings.warn( @@ -53,7 +61,7 @@ def resample(y: array, src_sr: int, target_sr: int, if not isinstance(y, np.ndarray): raise ParameterError( - 'Only support numpy array, but received y in {type(y)}') + 'Only support numpy np.ndarray, but received y in {type(y)}') if mode not in RESAMPLE_MODES: raise ParameterError(f'resample mode must in {RESAMPLE_MODES}') @@ -61,9 +69,17 @@ def resample(y: array, src_sr: int, target_sr: int, return resampy.resample(y, src_sr, target_sr, filter=mode) -def to_mono(y: array, merge_type: str='average') -> array: - """ convert sterior audio to mono +def to_mono(y: np.ndarray, merge_type: str='average') -> np.ndarray: + """Convert sterior audio to mono. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + merge_type (str, optional): Merge type to generate mono waveform. Defaults to 'average'. + + Returns: + np.ndarray: `y` with mono channel. """ + if merge_type not in MERGE_TYPES: raise ParameterError( f'Unsupported merge type {merge_type}, available types are {MERGE_TYPES}' @@ -101,18 +117,34 @@ def to_mono(y: array, merge_type: str='average') -> array: return y_out -def _safe_cast(y: array, dtype: Union[type, str]) -> array: - """ data type casting in a safe way, i.e., prevent overflow or underflow - This function is used internally. +def _safe_cast(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray: + """Data type casting in a safe way, i.e., prevent overflow or underflow. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + dtype (Union[type, str]): Data type of waveform. + + Returns: + np.ndarray: `y` after safe casting. """ - return np.clip(y, np.iinfo(dtype).min, np.iinfo(dtype).max).astype(dtype) + if 'float' in str(y.dtype): + return np.clip(y, np.finfo(dtype).min, + np.finfo(dtype).max).astype(dtype) + else: + return np.clip(y, np.iinfo(dtype).min, + np.iinfo(dtype).max).astype(dtype) -def depth_convert(y: array, dtype: Union[type, str], - dithering: bool=True) -> array: - """Convert audio array to target dtype safely - This function convert audio waveform to a target dtype, with addition steps of +def depth_convert(y: np.ndarray, dtype: Union[type, str]) -> np.ndarray: + """Convert audio array to target dtype safely. This function convert audio waveform to a target dtype, with addition steps of preventing overflow/underflow and preserving audio range. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + dtype (Union[type, str]): Data type of waveform. + + Returns: + np.ndarray: `y` after safe casting. """ SUPPORT_DTYPE = ['int16', 'int8', 'float32', 'float64'] @@ -157,14 +189,20 @@ def depth_convert(y: array, dtype: Union[type, str], return y -def sound_file_load(file: str, +def sound_file_load(file: os.PathLike, offset: Optional[float]=None, dtype: str='int16', - duration: Optional[int]=None) -> Tuple[array, int]: - """Load audio using soundfile library - This function load audio file using libsndfile. - Reference: - http://www.mega-nerd.com/libsndfile/#Features + duration: Optional[int]=None) -> Tuple[np.ndarray, int]: + """Load audio using soundfile library. This function load audio file using libsndfile. + + Args: + file (os.PathLike): File of waveform. + offset (Optional[float], optional): Offset to the start of waveform. Defaults to None. + dtype (str, optional): Data type of waveform. Defaults to 'int16'. + duration (Optional[int], optional): Duration of waveform to read. Defaults to None. 
+ + Returns: + Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate. """ with sf.SoundFile(file) as sf_desc: sr_native = sf_desc.samplerate @@ -179,9 +217,17 @@ def sound_file_load(file: str, return y, sf_desc.samplerate -def normalize(y: array, norm_type: str='linear', - mul_factor: float=1.0) -> array: - """ normalize an input audio with additional multiplier. +def normalize(y: np.ndarray, norm_type: str='linear', + mul_factor: float=1.0) -> np.ndarray: + """Normalize an input audio with additional multiplier. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + norm_type (str, optional): Type of normalization. Defaults to 'linear'. + mul_factor (float, optional): Scaling factor. Defaults to 1.0. + + Returns: + np.ndarray: `y` after normalization. """ if norm_type == 'linear': @@ -199,12 +245,13 @@ def normalize(y: array, norm_type: str='linear', return y -def save(y: array, sr: int, file: str) -> None: - """Save audio file to disk. - This function saves audio to disk using scipy.io.wavfile, with additional step - to convert input waveform to int16 unless it already is int16 - Notes: - It only support raw wav format. +def save(y: np.ndarray, sr: int, file: os.PathLike) -> None: + """Save audio file to disk. This function saves audio to disk using scipy.io.wavfile, with additional step to convert input waveform to int16. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + sr (int): Sample rate. + file (os.PathLike): Path of auido file to save. """ if not file.endswith('.wav'): raise ParameterError( @@ -226,7 +273,7 @@ def save(y: array, sr: int, file: str) -> None: def load( - file: str, + file: os.PathLike, sr: Optional[int]=None, mono: bool=True, merge_type: str='average', # ch0,ch1,random,average @@ -236,11 +283,24 @@ def load( offset: float=0.0, duration: Optional[int]=None, dtype: str='float32', - resample_mode: str='kaiser_fast') -> Tuple[array, int]: - """Load audio file from disk. - This function loads audio from disk using using audio beackend. - Parameters: - Notes: + resample_mode: str='kaiser_fast') -> Tuple[np.ndarray, int]: + """Load audio file from disk. This function loads audio from disk using using audio beackend. + + Args: + file (os.PathLike): Path of auido file to load. + sr (Optional[int], optional): Sample rate of loaded waveform. Defaults to None. + mono (bool, optional): Return waveform with mono channel. Defaults to True. + merge_type (str, optional): Merge type of multi-channels waveform. Defaults to 'average'. + normal (bool, optional): Waveform normalization. Defaults to True. + norm_type (str, optional): Type of normalization. Defaults to 'linear'. + norm_mul_factor (float, optional): Scaling factor. Defaults to 1.0. + offset (float, optional): Offset to the start of waveform. Defaults to 0.0. + duration (Optional[int], optional): Duration of waveform to read. Defaults to None. + dtype (str, optional): Data type of waveform. Defaults to 'float32'. + resample_mode (str, optional): The resampling filter to use. Defaults to 'kaiser_fast'. + + Returns: + Tuple[np.ndarray, int]: Waveform in ndarray and its samplerate. """ y, r = sound_file_load(file, offset=offset, dtype=dtype, duration=duration) diff --git a/paddleaudio/paddleaudio/compliance/__init__.py b/paddleaudio/paddleaudio/compliance/__init__.py index 97043fd7b..c08f9ab11 100644 --- a/paddleaudio/paddleaudio/compliance/__init__.py +++ b/paddleaudio/paddleaudio/compliance/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
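
A small illustrative round trip through the soundfile backend helpers whose docstrings are rewritten above; the file names are placeholders, and the behavior assumed here (mono float32 output, `kaiser_fast` resampling, int16 wav on save) follows those docstrings.

```python
# Hedged sketch of load -> resample -> save using the backend documented above.
from paddleaudio.backends.soundfile_backend import load, resample, save

y, sr = load("input_44k.wav", dtype="float32")   # placeholder path; mono 1-D by default
if sr != 16000:
    y = resample(y, src_sr=sr, target_sr=16000)  # resampy 'kaiser_fast' filter by default
save(y, 16000, "output_16k.wav")                 # converted to int16 before writing
```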
# See the License for the specific language governing permissions and # limitations under the License. +from . import kaldi +from . import librosa diff --git a/paddleaudio/paddleaudio/compliance/kaldi.py b/paddleaudio/paddleaudio/compliance/kaldi.py index 8cb9b6660..538be0196 100644 --- a/paddleaudio/paddleaudio/compliance/kaldi.py +++ b/paddleaudio/paddleaudio/compliance/kaldi.py @@ -220,7 +220,7 @@ def spectrogram(waveform: Tensor, """Compute and return a spectrogram from a waveform. The output is identical to Kaldi's. Args: - waveform (Tensor): A waveform tensor with shape [C, T]. + waveform (Tensor): A waveform tensor with shape `(C, T)`. blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. channel (int, optional): Select the channel of waveform. Defaults to -1. dither (float, optional): Dithering constant . Defaults to 0.0. @@ -239,7 +239,7 @@ def spectrogram(waveform: Tensor, window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. Returns: - Tensor: A spectrogram tensor with shape (m, padded_window_size // 2 + 1) where m is the number of frames + Tensor: A spectrogram tensor with shape `(m, padded_window_size // 2 + 1)` where m is the number of frames depends on frame_length and frame_shift. """ dtype = waveform.dtype @@ -422,7 +422,7 @@ def fbank(waveform: Tensor, """Compute and return filter banks from a waveform. The output is identical to Kaldi's. Args: - waveform (Tensor): A waveform tensor with shape [C, T]. + waveform (Tensor): A waveform tensor with shape `(C, T)`. blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. channel (int, optional): Select the channel of waveform. Defaults to -1. dither (float, optional): Dithering constant . Defaults to 0.0. @@ -451,7 +451,7 @@ def fbank(waveform: Tensor, window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. Returns: - Tensor: A filter banks tensor with shape (m, n_mels). + Tensor: A filter banks tensor with shape `(m, n_mels)`. """ dtype = waveform.dtype @@ -542,7 +542,7 @@ def mfcc(waveform: Tensor, identical to Kaldi's. Args: - waveform (Tensor): A waveform tensor with shape [C, T]. + waveform (Tensor): A waveform tensor with shape `(C, T)`. blackman_coeff (float, optional): Coefficient for Blackman window.. Defaults to 0.42. cepstral_lifter (float, optional): Scaling of output mfccs. Defaults to 22.0. channel (int, optional): Select the channel of waveform. Defaults to -1. @@ -571,7 +571,7 @@ def mfcc(waveform: Tensor, window_type (str, optional): Choose type of window for FFT computation. Defaults to POVEY. Returns: - Tensor: A mel frequency cepstral coefficients tensor with shape (m, n_mfcc). + Tensor: A mel frequency cepstral coefficients tensor with shape `(m, n_mfcc)`. 
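
Since `paddleaudio.compliance` now re-exports `kaldi` and `librosa` (see the `__init__.py` change above), a kaldi-style filterbank can be computed directly from a `(C, T)` paddle tensor, matching the shape convention clarified in these docstrings. This is only a sketch: the wav path is a placeholder and all parameters other than `n_mels` are assumed to keep their defaults (16 kHz input).

```python
# Sketch: kaldi-compliance fbank on a (C, T) tensor, per the docstrings above.
import paddle

from paddleaudio.backends import load
from paddleaudio.compliance import kaldi

waveform, sr = load("input_16k.wav", dtype="float32")  # placeholder 16 kHz file
x = paddle.to_tensor(waveform).unsqueeze(0)            # (1, T): single channel
feat = kaldi.fbank(x, n_mels=80)                       # (num_frames, n_mels)
print(feat.shape)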
""" assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( n_mfcc, n_mels) diff --git a/paddleaudio/paddleaudio/compliance/librosa.py b/paddleaudio/paddleaudio/compliance/librosa.py index 167795c37..740584ca5 100644 --- a/paddleaudio/paddleaudio/compliance/librosa.py +++ b/paddleaudio/paddleaudio/compliance/librosa.py @@ -19,7 +19,6 @@ from typing import Union import numpy as np import scipy -from numpy import ndarray as array from numpy.lib.stride_tricks import as_strided from scipy import signal @@ -32,7 +31,6 @@ __all__ = [ 'mfcc', 'hz_to_mel', 'mel_to_hz', - 'split_frames', 'mel_frequencies', 'power_to_db', 'compute_fbank_matrix', @@ -49,7 +47,8 @@ __all__ = [ ] -def pad_center(data: array, size: int, axis: int=-1, **kwargs) -> array: +def _pad_center(data: np.ndarray, size: int, axis: int=-1, + **kwargs) -> np.ndarray: """Pad an array to a target length along a target axis. This differs from `np.pad` by centering the data prior to padding, @@ -69,8 +68,10 @@ def pad_center(data: array, size: int, axis: int=-1, **kwargs) -> array: return np.pad(data, lengths, **kwargs) -def split_frames(x: array, frame_length: int, hop_length: int, - axis: int=-1) -> array: +def _split_frames(x: np.ndarray, + frame_length: int, + hop_length: int, + axis: int=-1) -> np.ndarray: """Slice a data array into (overlapping) frames. This function is aligned with librosa.frame @@ -142,11 +143,16 @@ def _check_audio(y, mono=True) -> bool: return True -def hz_to_mel(frequencies: Union[float, List[float], array], - htk: bool=False) -> array: - """Convert Hz to Mels +def hz_to_mel(frequencies: Union[float, List[float], np.ndarray], + htk: bool=False) -> np.ndarray: + """Convert Hz to Mels. - This function is aligned with librosa. + Args: + frequencies (Union[float, List[float], np.ndarray]): Frequencies in Hz. + htk (bool, optional): Use htk scaling. Defaults to False. + + Returns: + np.ndarray: Frequency in mels. """ freq = np.asanyarray(frequencies) @@ -177,10 +183,16 @@ def hz_to_mel(frequencies: Union[float, List[float], array], return mels -def mel_to_hz(mels: Union[float, List[float], array], htk: int=False) -> array: +def mel_to_hz(mels: Union[float, List[float], np.ndarray], + htk: int=False) -> np.ndarray: """Convert mel bin numbers to frequencies. - This function is aligned with librosa. + Args: + mels (Union[float, List[float], np.ndarray]): Frequency in mels. + htk (bool, optional): Use htk scaling. Defaults to False. + + Returns: + np.ndarray: Frequencies in Hz. """ mel_array = np.asanyarray(mels) @@ -212,10 +224,17 @@ def mel_to_hz(mels: Union[float, List[float], array], htk: int=False) -> array: def mel_frequencies(n_mels: int=128, fmin: float=0.0, fmax: float=11025.0, - htk: bool=False) -> array: - """Compute mel frequencies + htk: bool=False) -> np.ndarray: + """Compute mel frequencies. + + Args: + n_mels (int, optional): Number of mel bins. Defaults to 128. + fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0. + fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0. + htk (bool, optional): Use htk scaling. Defaults to False. - This function is aligned with librosa. + Returns: + np.ndarray: Vector of n_mels frequencies in Hz with shape `(n_mels,)`. 
""" # 'Center freqs' of mel bands - uniformly spaced between limits min_mel = hz_to_mel(fmin, htk=htk) @@ -226,10 +245,15 @@ def mel_frequencies(n_mels: int=128, return mel_to_hz(mels, htk=htk) -def fft_frequencies(sr: int, n_fft: int) -> array: +def fft_frequencies(sr: int, n_fft: int) -> np.ndarray: """Compute fourier frequencies. - This function is aligned with librosa. + Args: + sr (int): Sample rate. + n_fft (int): FFT size. + + Returns: + np.ndarray: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. """ return np.linspace(0, float(sr) / 2, int(1 + n_fft // 2), endpoint=True) @@ -241,10 +265,22 @@ def compute_fbank_matrix(sr: int, fmax: Optional[float]=None, htk: bool=False, norm: str="slaney", - dtype: type=np.float32): + dtype: type=np.float32) -> np.ndarray: """Compute fbank matrix. - This funciton is aligned with librosa. + Args: + sr (int): Sample rate. + n_fft (int): FFT size. + n_mels (int, optional): Number of mel bins. Defaults to 128. + fmin (float, optional): Minimum frequency in Hz. Defaults to 0.0. + fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use htk scaling. Defaults to False. + norm (str, optional): Type of normalization. Defaults to "slaney". + dtype (type, optional): Data type. Defaults to np.float32. + + + Returns: + np.ndarray: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. """ if norm != "slaney": raise ParameterError('norm must set to slaney') @@ -289,17 +325,28 @@ def compute_fbank_matrix(sr: int, return weights -def stft(x: array, +def stft(x: np.ndarray, n_fft: int=2048, hop_length: Optional[int]=None, win_length: Optional[int]=None, window: str="hann", center: bool=True, dtype: type=np.complex64, - pad_mode: str="reflect") -> array: + pad_mode: str="reflect") -> np.ndarray: """Short-time Fourier transform (STFT). - This function is aligned with librosa. + Args: + x (np.ndarray): Input waveform in one dimension. + n_fft (int, optional): FFT size. Defaults to 2048. + hop_length (Optional[int], optional): Number of steps to advance between adjacent windows. Defaults to None. + win_length (Optional[int], optional): The size of window. Defaults to None. + window (str, optional): A string of window specification. Defaults to "hann". + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. + dtype (type, optional): Data type of STFT results. Defaults to np.complex64. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". + + Returns: + np.ndarray: The complex STFT output with shape `(n_fft//2 + 1, num_frames)`. """ _check_audio(x) @@ -314,7 +361,7 @@ def stft(x: array, fft_window = signal.get_window(window, win_length, fftbins=True) # Pad the window out to n_fft size - fft_window = pad_center(fft_window, n_fft) + fft_window = _pad_center(fft_window, n_fft) # Reshape so that the window can be broadcast fft_window = fft_window.reshape((-1, 1)) @@ -333,7 +380,7 @@ def stft(x: array, ) # Window the time series. 
- x_frames = split_frames(x, frame_length=n_fft, hop_length=hop_length) + x_frames = _split_frames(x, frame_length=n_fft, hop_length=hop_length) # Pre-allocate the STFT matrix stft_matrix = np.empty( (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F") @@ -352,16 +399,20 @@ def stft(x: array, return stft_matrix -def power_to_db(spect: array, +def power_to_db(spect: np.ndarray, ref: float=1.0, amin: float=1e-10, - top_db: Optional[float]=80.0) -> array: - """Convert a power spectrogram (amplitude squared) to decibel (dB) units + top_db: Optional[float]=80.0) -> np.ndarray: + """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way. - This computes the scaling ``10 * log10(spect / ref)`` in a numerically - stable way. + Args: + spect (np.ndarray): STFT power spectrogram of an input waveform. + ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): Minimum threshold. Defaults to 1e-10. + top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to 80.0. - This function is aligned with librosa. + Returns: + np.ndarray: Power spectrogram in db scale. """ spect = np.asarray(spect) @@ -394,49 +445,27 @@ def power_to_db(spect: array, return log_spec -def mfcc(x, +def mfcc(x: np.ndarray, sr: int=16000, - spect: Optional[array]=None, + spect: Optional[np.ndarray]=None, n_mfcc: int=20, dct_type: int=2, norm: str="ortho", lifter: int=0, - **kwargs) -> array: + **kwargs) -> np.ndarray: """Mel-frequency cepstral coefficients (MFCCs) - This function is NOT strictly aligned with librosa. The following example shows how to get the - same result with librosa: - - # mfcc: - kwargs = { - 'window_size':512, - 'hop_length':320, - 'mel_bins':64, - 'fmin':50, - 'to_db':False} - a = mfcc(x, - spect=None, - n_mfcc=20, - dct_type=2, - norm='ortho', - lifter=0, - **kwargs) - - # librosa mfcc: - spect = librosa.feature.melspectrogram(y=x,sr=16000,n_fft=512, - win_length=512, - hop_length=320, - n_mels=64, fmin=50) - b = librosa.feature.mfcc(y=x, - sr=16000, - S=spect, - n_mfcc=20, - dct_type=2, - norm='ortho', - lifter=0) - - assert np.mean( (a-b)**2) < 1e-8 + Args: + x (np.ndarray): Input waveform in one dimension. + sr (int, optional): Sample rate. Defaults to 16000. + spect (Optional[np.ndarray], optional): Input log-power Mel spectrogram. Defaults to None. + n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 20. + dct_type (int, optional): Discrete cosine transform (DCT) type. Defaults to 2. + norm (str, optional): Type of normalization. Defaults to "ortho". + lifter (int, optional): Cepstral filtering. Defaults to 0. + Returns: + np.ndarray: Mel frequency cepstral coefficients array with shape `(n_mfcc, num_frames)`. 
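
The long inline comparison with librosa is dropped from the `mfcc` docstring above; as a compact stand-in, the sketch below shows the intended call pattern, with the extra keyword arguments forwarded to `melspectrogram`. The wav path is a placeholder and the parameter values mirror the documented defaults, not a verified librosa-equivalence recipe.

```python
# Hedged sketch of the numpy mfcc front end documented above.
from paddleaudio.backends import load
from paddleaudio.compliance.librosa import mfcc

x, sr = load("input_16k.wav", dtype="float32")  # placeholder path
feat = mfcc(
    x,
    sr=sr,
    n_mfcc=20,
    window_size=512,   # forwarded to melspectrogram()
    hop_length=320,
    n_mels=64,
    fmin=50.0,
    to_db=False)
print(feat.shape)      # (n_mfcc, num_frames)
```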
""" if spect is None: spect = melspectrogram(x, sr=sr, **kwargs) @@ -454,12 +483,12 @@ def mfcc(x, f"MFCC lifter={lifter} must be a non-negative number") -def melspectrogram(x: array, +def melspectrogram(x: np.ndarray, sr: int=16000, window_size: int=512, hop_length: int=320, n_mels: int=64, - fmin: int=50, + fmin: float=50.0, fmax: Optional[float]=None, window: str='hann', center: bool=True, @@ -468,27 +497,28 @@ def melspectrogram(x: array, to_db: bool=True, ref: float=1.0, amin: float=1e-10, - top_db: Optional[float]=None) -> array: + top_db: Optional[float]=None) -> np.ndarray: """Compute mel-spectrogram. - Parameters: - x: numpy.ndarray - The input wavform is a numpy array [shape=(n,)] - - window_size: int, typically 512, 1024, 2048, etc. - The window size for framing, also used as n_fft for stft - + Args: + x (np.ndarray): Input waveform in one dimension. + sr (int, optional): Sample rate. Defaults to 16000. + window_size (int, optional): Size of FFT and window length. Defaults to 512. + hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320. + n_mels (int, optional): Number of mel bins. Defaults to 64. + fmin (float, optional): Minimum frequency in Hz. Defaults to 50.0. + fmax (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + window (str, optional): A string of window specification. Defaults to "hann". + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". + power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0. + to_db (bool, optional): Enable db scale. Defaults to True. + ref (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): Minimum threshold. Defaults to 1e-10. + top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None. Returns: - The mel-spectrogram in power scale or db scale(default) - - - Notes: - 1. sr is default to 16000, which is commonly used in speech/speaker processing. - 2. when fmax is None, it is set to sr//2. - 3. this function will convert mel spectgrum to db scale by default. This is different - that of librosa. - + np.ndarray: The mel-spectrogram in power scale or db scale with shape `(n_mels, num_frames)`. """ _check_audio(x, mono=True) if len(x) <= 0: @@ -518,18 +548,28 @@ def melspectrogram(x: array, return mel_spect -def spectrogram(x: array, +def spectrogram(x: np.ndarray, sr: int=16000, window_size: int=512, hop_length: int=320, window: str='hann', center: bool=True, pad_mode: str='reflect', - power: float=2.0) -> array: - """Compute spectrogram from an input waveform. + power: float=2.0) -> np.ndarray: + """Compute spectrogram. + + Args: + x (np.ndarray): Input waveform in one dimension. + sr (int, optional): Sample rate. Defaults to 16000. + window_size (int, optional): Size of FFT and window length. Defaults to 512. + hop_length (int, optional): Number of steps to advance between adjacent windows. Defaults to 320. + window (str, optional): A string of window specification. Defaults to "hann". + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. 
+ pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to "reflect". + power (float, optional): Exponent for the magnitude melspectrogram. Defaults to 2.0. - This function is a wrapper for librosa.feature.stft, with addition step to - compute the magnitude of the complex spectrogram. + Returns: + np.ndarray: The STFT spectrogram in power scale `(n_fft//2 + 1, num_frames)`. """ s = stft( @@ -544,18 +584,16 @@ def spectrogram(x: array, return np.abs(s)**power -def mu_encode(x: array, mu: int=255, quantized: bool=True) -> array: - """Mu-law encoding. - - Compute the mu-law decoding given an input code. - When quantized is True, the result will be converted to - integer in range [0,mu-1]. Otherwise, the resulting signal - is in range [-1,1] - +def mu_encode(x: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray: + """Mu-law encoding. Encode waveform based on mu-law companding. When quantized is True, the result will be converted to integer in range `[0,mu-1]`. Otherwise, the resulting waveform is in range `[-1,1]`. - Reference: - https://en.wikipedia.org/wiki/%CE%9C-law_algorithm + Args: + x (np.ndarray): The input waveform to encode. + mu (int, optional): The endoceding parameter. Defaults to 255. + quantized (bool, optional): If `True`, quantize the encoded values into `1 + mu` distinct integer values. Defaults to True. + Returns: + np.ndarray: The mu-law encoded waveform. """ mu = 255 y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu) @@ -564,17 +602,16 @@ def mu_encode(x: array, mu: int=255, quantized: bool=True) -> array: return y -def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array: - """Mu-law decoding. - - Compute the mu-law decoding given an input code. +def mu_decode(y: np.ndarray, mu: int=255, quantized: bool=True) -> np.ndarray: + """Mu-law decoding. Compute the mu-law decoding given an input code. It assumes that the input `y` is in range `[0,mu-1]` when quantize is True and `[-1,1]` otherwise. - it assumes that the input y is in - range [0,mu-1] when quantize is True and [-1,1] otherwise - - Reference: - https://en.wikipedia.org/wiki/%CE%9C-law_algorithm + Args: + y (np.ndarray): The encoded waveform. + mu (int, optional): The endoceding parameter. Defaults to 255. + quantized (bool, optional): If `True`, the input is assumed to be quantized to `1 + mu` distinct integer values. Defaults to True. + Returns: + np.ndarray: The mu-law decoded waveform. """ if mu < 1: raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...') @@ -586,7 +623,7 @@ def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array: return x -def randint(high: int) -> int: +def _randint(high: int) -> int: """Generate one random integer in range [0 high) This is a helper function for random data augmentaiton @@ -594,20 +631,18 @@ def randint(high: int) -> int: return int(np.random.randint(0, high=high)) -def rand() -> float: - """Generate one floating-point number in range [0 1) - - This is a helper function for random data augmentaiton - """ - return float(np.random.rand(1)) - - -def depth_augment(y: array, +def depth_augment(y: np.ndarray, choices: List=['int8', 'int16'], - probs: List[float]=[0.5, 0.5]) -> array: - """ Audio depth augmentation + probs: List[float]=[0.5, 0.5]) -> np.ndarray: + """ Audio depth augmentation. Do audio depth augmentation to simulate the distortion brought by quantization. + + Args: + y (np.ndarray): Input waveform array in 1D or 2D. + choices (List, optional): A list of data type to depth conversion. 
Defaults to ['int8', 'int16']. + probs (List[float], optional): Probabilities to depth conversion. Defaults to [0.5, 0.5]. - Do audio depth augmentation to simulate the distortion brought by quantization. + Returns: + np.ndarray: The augmented waveform. """ assert len(probs) == len( choices @@ -621,13 +656,18 @@ def depth_augment(y: array, return y2 -def adaptive_spect_augment(spect: array, tempo_axis: int=0, - level: float=0.1) -> array: - """Do adpative spectrogram augmentation +def adaptive_spect_augment(spect: np.ndarray, + tempo_axis: int=0, + level: float=0.1) -> np.ndarray: + """Do adpative spectrogram augmentation. The level of the augmentation is gowern by the paramter level, ranging from 0 to 1, with 0 represents no augmentation. - The level of the augmentation is gowern by the paramter level, - ranging from 0 to 1, with 0 represents no augmentation。 + Args: + spect (np.ndarray): Input spectrogram. + tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0. + level (float, optional): The level factor of masking. Defaults to 0.1. + Returns: + np.ndarray: The augmented spectrogram. """ assert spect.ndim == 2., 'only supports 2d tensor or numpy array' if tempo_axis == 0: @@ -643,32 +683,40 @@ def adaptive_spect_augment(spect: array, tempo_axis: int=0, if tempo_axis == 0: for _ in range(num_time_mask): - start = randint(nt - time_mask_width) + start = _randint(nt - time_mask_width) spect[start:start + time_mask_width, :] = 0 for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) + start = _randint(nf - freq_mask_width) spect[:, start:start + freq_mask_width] = 0 else: for _ in range(num_time_mask): - start = randint(nt - time_mask_width) + start = _randint(nt - time_mask_width) spect[:, start:start + time_mask_width] = 0 for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) + start = _randint(nf - freq_mask_width) spect[start:start + freq_mask_width, :] = 0 return spect -def spect_augment(spect: array, +def spect_augment(spect: np.ndarray, tempo_axis: int=0, max_time_mask: int=3, max_freq_mask: int=3, max_time_mask_width: int=30, - max_freq_mask_width: int=20) -> array: - """Do spectrogram augmentation in both time and freq axis + max_freq_mask_width: int=20) -> np.ndarray: + """Do spectrogram augmentation in both time and freq axis. - Reference: + Args: + spect (np.ndarray): Input spectrogram. + tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0. + max_time_mask (int, optional): Maximum number of time masking. Defaults to 3. + max_freq_mask (int, optional): Maximum number of frenquence masking. Defaults to 3. + max_time_mask_width (int, optional): Maximum width of time masking. Defaults to 30. + max_freq_mask_width (int, optional): Maximum width of frenquence masking. Defaults to 20. + Returns: + np.ndarray: The augmented spectrogram. 
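
To show how the masking-based augmentation documented above is meant to be applied, here is a sketch that masks a log-mel spectrogram whose frames sit on the second axis (`tempo_axis=1`). The wav path is a placeholder, and the clip is assumed to be long enough that the number of frames exceeds the maximum time-mask width.

```python
# Hedged sketch of SpecAugment-style masking with the helpers documented above.
from paddleaudio.backends import load
from paddleaudio.compliance.librosa import melspectrogram, spect_augment

x, sr = load("input_16k.wav", dtype="float32")                  # placeholder path
mel = melspectrogram(x, sr=sr, window_size=512, hop_length=320,
                     n_mels=64)                                 # (n_mels, num_frames)
mel_aug = spect_augment(
    mel,                       # masking happens in place; the array is also returned
    tempo_axis=1,              # frames are on axis 1 of (n_mels, num_frames)
    max_time_mask=2,
    max_freq_mask=2,
    max_time_mask_width=30,
    max_freq_mask_width=8)
```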
""" assert spect.ndim == 2., 'only supports 2d tensor or numpy array' if tempo_axis == 0: @@ -676,52 +724,64 @@ def spect_augment(spect: array, else: nf, nt = spect.shape - num_time_mask = randint(max_time_mask) - num_freq_mask = randint(max_freq_mask) + num_time_mask = _randint(max_time_mask) + num_freq_mask = _randint(max_freq_mask) - time_mask_width = randint(max_time_mask_width) - freq_mask_width = randint(max_freq_mask_width) + time_mask_width = _randint(max_time_mask_width) + freq_mask_width = _randint(max_freq_mask_width) if tempo_axis == 0: for _ in range(num_time_mask): - start = randint(nt - time_mask_width) + start = _randint(nt - time_mask_width) spect[start:start + time_mask_width, :] = 0 for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) + start = _randint(nf - freq_mask_width) spect[:, start:start + freq_mask_width] = 0 else: for _ in range(num_time_mask): - start = randint(nt - time_mask_width) + start = _randint(nt - time_mask_width) spect[:, start:start + time_mask_width] = 0 for _ in range(num_freq_mask): - start = randint(nf - freq_mask_width) + start = _randint(nf - freq_mask_width) spect[start:start + freq_mask_width, :] = 0 return spect -def random_crop1d(y: array, crop_len: int) -> array: - """ Do random cropping on 1d input signal +def random_crop1d(y: np.ndarray, crop_len: int) -> np.ndarray: + """ Random cropping on a input waveform. - The input is a 1d signal, typically a sound waveform + Args: + y (np.ndarray): Input waveform array in 1D. + crop_len (int): Length of waveform to crop. + + Returns: + np.ndarray: The cropped waveform. """ if y.ndim != 1: 'only accept 1d tensor or numpy array' n = len(y) - idx = randint(n - crop_len) + idx = _randint(n - crop_len) return y[idx:idx + crop_len] -def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array: - """ Do random cropping for 2D array, typically a spectrogram. +def random_crop2d(s: np.ndarray, crop_len: int, + tempo_axis: int=0) -> np.ndarray: + """ Random cropping on a spectrogram. - The cropping is done in temporal direction on the time-freq input signal. + Args: + s (np.ndarray): Input spectrogram in 2D. + crop_len (int): Length of spectrogram to crop. + tempo_axis (int, optional): Indicate the tempo axis. Defaults to 0. + + Returns: + np.ndarray: The cropped spectrogram. """ if tempo_axis >= s.ndim: raise ParameterError('axis out of range') n = s.shape[tempo_axis] - idx = randint(high=n - crop_len) + idx = _randint(high=n - crop_len) sli = [slice(None) for i in range(s.ndim)] sli[tempo_axis] = slice(idx, idx + crop_len) out = s[tuple(sli)] diff --git a/paddleaudio/paddleaudio/datasets/__init__.py b/paddleaudio/paddleaudio/datasets/__init__.py index 5c5f03694..ebd4af984 100644 --- a/paddleaudio/paddleaudio/datasets/__init__.py +++ b/paddleaudio/paddleaudio/datasets/__init__.py @@ -13,5 +13,7 @@ # limitations under the License. from .esc50 import ESC50 from .gtzan import GTZAN +from .rirs_noises import OpenRIRNoise from .tess import TESS from .urban_sound import UrbanSound8K +from .voxceleb import VoxCeleb diff --git a/paddleaudio/paddleaudio/datasets/rirs_noises.py b/paddleaudio/paddleaudio/datasets/rirs_noises.py new file mode 100644 index 000000000..68639a604 --- /dev/null +++ b/paddleaudio/paddleaudio/datasets/rirs_noises.py @@ -0,0 +1,201 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import csv +import os +import random +from typing import List + +from paddle.io import Dataset +from tqdm import tqdm + +from ..backends import load as load_audio +from ..backends import save as save_wav +from ..utils import DATA_HOME +from ..utils.download import download_and_decompress +from .dataset import feat_funcs + +__all__ = ['OpenRIRNoise'] + + +class OpenRIRNoise(Dataset): + archieves = [ + { + 'url': 'http://www.openslr.org/resources/28/rirs_noises.zip', + 'md5': 'e6f48e257286e05de56413b4779d8ffb', + }, + ] + + sample_rate = 16000 + meta_info = collections.namedtuple('META_INFO', ('id', 'duration', 'wav')) + base_path = os.path.join(DATA_HOME, 'open_rir_noise') + wav_path = os.path.join(base_path, 'RIRS_NOISES') + csv_path = os.path.join(base_path, 'csv') + subsets = ['rir', 'noise'] + + def __init__(self, + subset: str='rir', + feat_type: str='raw', + target_dir=None, + random_chunk: bool=True, + chunk_duration: float=3.0, + seed: int=0, + **kwargs): + + assert subset in self.subsets, \ + 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) + + self.subset = subset + self.feat_type = feat_type + self.feat_config = kwargs + self.random_chunk = random_chunk + self.chunk_duration = chunk_duration + + OpenRIRNoise.csv_path = os.path.join( + target_dir, "open_rir_noise", + "csv") if target_dir else self.csv_path + self._data = self._get_data() + super(OpenRIRNoise, self).__init__() + + # Set up a seed to reproduce training or predicting result. + # random.seed(seed) + + def _get_data(self): + # Download audio files. + print(f"rirs noises base path: {self.base_path}") + if not os.path.isdir(self.base_path): + download_and_decompress( + self.archieves, self.base_path, decompress=True) + else: + print( + f"{self.base_path} already exists, we will not download and decompress again" + ) + + # Data preparation. 
+ print(f"prepare the csv to {self.csv_path}") + if not os.path.isdir(self.csv_path): + os.makedirs(self.csv_path) + self.prepare_data() + + data = [] + with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf: + for line in rf.readlines()[1:]: + audio_id, duration, wav = line.strip().split(',') + data.append(self.meta_info(audio_id, float(duration), wav)) + + random.shuffle(data) + return data + + def _convert_to_record(self, idx: int): + sample = self._data[idx] + + record = {} + # To show all fields in a namedtuple: `type(sample)._fields` + for field in type(sample)._fields: + record[field] = getattr(sample, field) + + waveform, sr = load_audio(record['wav']) + + assert self.feat_type in feat_funcs.keys(), \ + f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}" + feat_func = feat_funcs[self.feat_type] + feat = feat_func( + waveform, sr=sr, **self.feat_config) if feat_func else waveform + + record.update({'feat': feat}) + return record + + @staticmethod + def _get_chunks(seg_dur, audio_id, audio_duration): + num_chunks = int(audio_duration / seg_dur) # all in milliseconds + + chunk_lst = [ + audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) + for i in range(num_chunks) + ] + return chunk_lst + + def _get_audio_info(self, wav_file: str, + split_chunks: bool) -> List[List[str]]: + waveform, sr = load_audio(wav_file) + audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0] + audio_duration = waveform.shape[0] / sr + + ret = [] + if split_chunks and audio_duration > self.chunk_duration: # Split into pieces of self.chunk_duration seconds. + uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id, + audio_duration) + + for idx, chunk in enumerate(uniq_chunks_list): + s, e = chunk.split("_")[-2:] # Timestamps of start and end + start_sample = int(float(s) * sr) + end_sample = int(float(e) * sr) + new_wav_file = os.path.join(self.base_path, + audio_id + f'_chunk_{idx+1:02}.wav') + save_wav(waveform[start_sample:end_sample], sr, new_wav_file) + # id, duration, new_wav + ret.append([chunk, self.chunk_duration, new_wav_file]) + else: # Keep whole audio. 
+ ret.append([audio_id, audio_duration, wav_file]) + return ret + + def generate_csv(self, + wav_files: List[str], + output_file: str, + split_chunks: bool=True): + print(f'Generating csv: {output_file}') + header = ["id", "duration", "wav"] + + infos = list( + tqdm( + map(self._get_audio_info, wav_files, [split_chunks] * len( + wav_files)), + total=len(wav_files))) + + csv_lines = [] + for info in infos: + csv_lines.extend(info) + + with open(output_file, mode="w") as csv_f: + csv_writer = csv.writer( + csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) + csv_writer.writerow(header) + for line in csv_lines: + csv_writer.writerow(line) + + def prepare_data(self): + rir_list = os.path.join(self.wav_path, "real_rirs_isotropic_noises", + "rir_list") + rir_files = [] + with open(rir_list, 'r') as f: + for line in f.readlines(): + rir_file = line.strip().split(' ')[-1] + rir_files.append(os.path.join(self.base_path, rir_file)) + + noise_list = os.path.join(self.wav_path, "pointsource_noises", + "noise_list") + noise_files = [] + with open(noise_list, 'r') as f: + for line in f.readlines(): + noise_file = line.strip().split(' ')[-1] + noise_files.append(os.path.join(self.base_path, noise_file)) + + self.generate_csv(rir_files, os.path.join(self.csv_path, 'rir.csv')) + self.generate_csv(noise_files, os.path.join(self.csv_path, 'noise.csv')) + + def __getitem__(self, idx): + return self._convert_to_record(idx) + + def __len__(self): + return len(self._data) diff --git a/paddleaudio/paddleaudio/datasets/voxceleb.py b/paddleaudio/paddleaudio/datasets/voxceleb.py new file mode 100644 index 000000000..3f72b5f2e --- /dev/null +++ b/paddleaudio/paddleaudio/datasets/voxceleb.py @@ -0,0 +1,356 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
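The `OpenRIRNoise` dataset completed above (just before the new `voxceleb.py` file begins) is a regular `paddle.io.Dataset`, so it can be indexed directly. A minimal usage sketch, assuming the class is re-exported from `paddleaudio.datasets` like the other datasets in this package:

```python
# Minimal usage sketch for the OpenRIRNoise dataset added above (import path assumed).
from paddleaudio.datasets import OpenRIRNoise

rir_ds = OpenRIRNoise(subset='rir')      # first use downloads rirs_noises.zip and writes csv/rir.csv
noise_ds = OpenRIRNoise(subset='noise')  # the noise subset reuses the same download

sample = rir_ds[0]
print(len(rir_ds), sample['id'], sample['duration'])
print(sample['feat'].shape)              # with feat_type='raw' (the default), 'feat' is the waveform
```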
+import collections +import csv +import glob +import os +import random +from multiprocessing import cpu_count +from typing import List + +from paddle.io import Dataset +from pathos.multiprocessing import Pool +from tqdm import tqdm + +from ..backends import load as load_audio +from ..utils import DATA_HOME +from ..utils import decompress +from ..utils.download import download_and_decompress +from .dataset import feat_funcs + +__all__ = ['VoxCeleb'] + + +class VoxCeleb(Dataset): + source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/' + archieves_audio_dev = [ + { + 'url': source_url + 'vox1_dev_wav_partaa', + 'md5': 'e395d020928bc15670b570a21695ed96', + }, + { + 'url': source_url + 'vox1_dev_wav_partab', + 'md5': 'bbfaaccefab65d82b21903e81a8a8020', + }, + { + 'url': source_url + 'vox1_dev_wav_partac', + 'md5': '017d579a2a96a077f40042ec33e51512', + }, + { + 'url': source_url + 'vox1_dev_wav_partad', + 'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19', + }, + ] + archieves_audio_test = [ + { + 'url': source_url + 'vox1_test_wav.zip', + 'md5': '185fdc63c3c739954633d50379a3d102', + }, + ] + archieves_meta = [ + { + 'url': + 'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt', + 'md5': + 'b73110731c9223c1461fe49cb48dddfc', + }, + ] + + num_speakers = 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41 + sample_rate = 16000 + meta_info = collections.namedtuple( + 'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id')) + base_path = os.path.join(DATA_HOME, 'vox1') + wav_path = os.path.join(base_path, 'wav') + meta_path = os.path.join(base_path, 'meta') + veri_test_file = os.path.join(meta_path, 'veri_test2.txt') + csv_path = os.path.join(base_path, 'csv') + subsets = ['train', 'dev', 'enroll', 'test'] + + def __init__( + self, + subset: str='train', + feat_type: str='raw', + random_chunk: bool=True, + chunk_duration: float=3.0, # seconds + split_ratio: float=0.9, # train split ratio + seed: int=0, + target_dir: str=None, + vox2_base_path=None, + **kwargs): + """VoxCeleb data prepare and get the specific dataset audio info + + Args: + subset (str, optional): dataset name, such as train, dev, enroll or test. Defaults to 'train'. + feat_type (str, optional): feat type, such raw, melspectrogram(fbank) or mfcc . Defaults to 'raw'. + random_chunk (bool, optional): random select a duration from audio. Defaults to True. + chunk_duration (float, optional): chunk duration if random_chunk flag is set. Defaults to 3.0. + target_dir (str, optional): data dir, audio info will be stored in this directory. Defaults to None. + vox2_base_path (_type_, optional): vox2 directory. vox2 data must be converted from m4a to wav. Defaults to None. 
+ """ + assert subset in self.subsets, \ + 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset) + + self.subset = subset + self.spk_id2label = {} + self.feat_type = feat_type + self.feat_config = kwargs + self.random_chunk = random_chunk + self.chunk_duration = chunk_duration + self.split_ratio = split_ratio + self.target_dir = target_dir if target_dir else VoxCeleb.base_path + self.vox2_base_path = vox2_base_path + + # if we set the target dir, we will change the vox data info data from base path to target dir + VoxCeleb.csv_path = os.path.join( + target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb.csv_path + VoxCeleb.meta_path = os.path.join( + target_dir, "voxceleb", + 'meta') if target_dir else VoxCeleb.meta_path + VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path, + 'veri_test2.txt') + # self._data = self._get_data()[:1000] # KP: Small dataset test. + self._data = self._get_data() + super(VoxCeleb, self).__init__() + + # Set up a seed to reproduce training or predicting result. + # random.seed(seed) + + def _get_data(self): + # Download audio files. + # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir + # so, we check the vox1/wav dir status + print(f"wav base path: {self.wav_path}") + if not os.path.isdir(self.wav_path): + print("start to download the voxceleb1 dataset") + download_and_decompress( # multi-zip parts concatenate to vox1_dev_wav.zip + self.archieves_audio_dev, + self.base_path, + decompress=False) + download_and_decompress( # download the vox1_test_wav.zip and unzip + self.archieves_audio_test, + self.base_path, + decompress=True) + + # Download all parts and concatenate the files into one zip file. + dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip') + print(f'Concatenating all parts to: {dev_zipfile}') + os.system( + f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}' + ) + + # Extract all audio files of dev and test set. + decompress(dev_zipfile, self.base_path) + + # Download meta files. + if not os.path.isdir(self.meta_path): + print("prepare the meta data") + download_and_decompress( + self.archieves_meta, self.meta_path, decompress=False) + + # Data preparation. 
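The download branch above stitches the four `vox1_dev_wav_parta*` archives into a single `vox1_dev_wav.zip` with a shell `cat` call before extracting it. A stand-alone sketch of that one step in pure Python (using `shutil.copyfileobj` instead of `cat`; `base_path` is whatever `self.base_path` resolves to):

```python
# Equivalent of the "concatenate dev parts, then unzip" step above, without shelling out to `cat`.
import glob
import os
import shutil


def concat_vox1_dev_parts(base_path: str) -> str:
    dev_zip = os.path.join(base_path, "vox1_dev_wav.zip")
    parts = sorted(glob.glob(os.path.join(base_path, "vox1_dev_wav_parta*")))
    with open(dev_zip, "wb") as out_f:
        for part in parts:
            with open(part, "rb") as in_f:
                shutil.copyfileobj(in_f, out_f)  # append each part in alphabetical order
    return dev_zip
```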
+ if not os.path.isdir(self.csv_path): + os.makedirs(self.csv_path) + self.prepare_data() + + data = [] + print( + f"read the {self.subset} from {os.path.join(self.csv_path, f'{self.subset}.csv')}" + ) + with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf: + for line in rf.readlines()[1:]: + audio_id, duration, wav, start, stop, spk_id = line.strip( + ).split(',') + data.append( + self.meta_info(audio_id, + float(duration), wav, + int(start), int(stop), spk_id)) + + with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f: + for line in f.readlines(): + spk_id, label = line.strip().split(' ') + self.spk_id2label[spk_id] = int(label) + + return data + + def _convert_to_record(self, idx: int): + sample = self._data[idx] + + record = {} + # To show all fields in a namedtuple: `type(sample)._fields` + for field in type(sample)._fields: + record[field] = getattr(sample, field) + + waveform, sr = load_audio(record['wav']) + + # random select a chunk audio samples from the audio + if self.random_chunk: + num_wav_samples = waveform.shape[0] + num_chunk_samples = int(self.chunk_duration * sr) + start = random.randint(0, num_wav_samples - num_chunk_samples - 1) + stop = start + num_chunk_samples + else: + start = record['start'] + stop = record['stop'] + + waveform = waveform[start:stop] + + assert self.feat_type in feat_funcs.keys(), \ + f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}" + feat_func = feat_funcs[self.feat_type] + feat = feat_func( + waveform, sr=sr, **self.feat_config) if feat_func else waveform + + record.update({'feat': feat}) + if self.subset in ['train', + 'dev']: # Labels are available in train and dev. + record.update({'label': self.spk_id2label[record['spk_id']]}) + + return record + + @staticmethod + def _get_chunks(seg_dur, audio_id, audio_duration): + num_chunks = int(audio_duration / seg_dur) # all in milliseconds + + chunk_lst = [ + audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur) + for i in range(num_chunks) + ] + return chunk_lst + + def _get_audio_info(self, wav_file: str, + split_chunks: bool) -> List[List[str]]: + waveform, sr = load_audio(wav_file) + spk_id, sess_id, utt_id = wav_file.split("/")[-3:] + audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]]) + audio_duration = waveform.shape[0] / sr + + ret = [] + if split_chunks: # Split into pieces of self.chunk_duration seconds. + uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id, + audio_duration) + + for chunk in uniq_chunks_list: + s, e = chunk.split("_")[-2:] # Timestamps of start and end + start_sample = int(float(s) * sr) + end_sample = int(float(e) * sr) + # id, duration, wav, start, stop, spk_id + ret.append([ + chunk, audio_duration, wav_file, start_sample, end_sample, + spk_id + ]) + else: # Keep whole audio. 
+ ret.append([ + audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id + ]) + return ret + + def generate_csv(self, + wav_files: List[str], + output_file: str, + split_chunks: bool=True): + print(f'Generating csv: {output_file}') + header = ["ID", "duration", "wav", "start", "stop", "spk_id"] + # Note: this may occurs c++ execption, but the program will execute fine + # so we can ignore the execption + with Pool(cpu_count()) as p: + infos = list( + tqdm( + p.imap(lambda x: self._get_audio_info(x, split_chunks), + wav_files), + total=len(wav_files))) + + csv_lines = [] + for info in infos: + csv_lines.extend(info) + + with open(output_file, mode="w") as csv_f: + csv_writer = csv.writer( + csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL) + csv_writer.writerow(header) + for line in csv_lines: + csv_writer.writerow(line) + + def prepare_data(self): + # Audio of speakers in veri_test_file should not be included in training set. + print("start to prepare the data csv file") + enroll_files = set() + test_files = set() + # get the enroll and test audio file path + with open(self.veri_test_file, 'r') as f: + for line in f.readlines(): + _, enrol_file, test_file = line.strip().split(' ') + enroll_files.add(os.path.join(self.wav_path, enrol_file)) + test_files.add(os.path.join(self.wav_path, test_file)) + enroll_files = sorted(enroll_files) + test_files = sorted(test_files) + + # get the enroll and test speakers + test_spks = set() + for file in (enroll_files + test_files): + spk = file.split('/wav/')[1].split('/')[0] + test_spks.add(spk) + + # get all the train and dev audios file path + audio_files = [] + speakers = set() + print("Getting file list...") + for path in [self.wav_path, self.vox2_base_path]: + # if vox2 directory is not set and vox2 is not a directory + # we will not process this directory + if not path or not os.path.exists(path): + print(f"{path} is an invalid path, please check again, " + "and we will ignore the vox2 base path") + continue + for file in glob.glob( + os.path.join(path, "**", "*.wav"), recursive=True): + spk = file.split('/wav/')[1].split('/')[0] + if spk in test_spks: + continue + speakers.add(spk) + audio_files.append(file) + + print( + f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}" + ) + # encode the train and dev speakers label to spk_id2label.txt + with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f: + for label, spk_id in enumerate( + sorted(speakers)): # 1211 vox1, 5994 vox2, 7205 vox1+2 + f.write(f'{spk_id} {label}\n') + + audio_files = sorted(audio_files) + random.shuffle(audio_files) + split_idx = int(self.split_ratio * len(audio_files)) + # split_ratio to train + train_files, dev_files = audio_files[:split_idx], audio_files[ + split_idx:] + + self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv')) + self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv')) + + self.generate_csv( + enroll_files, + os.path.join(self.csv_path, 'enroll.csv'), + split_chunks=False) + self.generate_csv( + test_files, + os.path.join(self.csv_path, 'test.csv'), + split_chunks=False) + + def __getitem__(self, idx): + return self._convert_to_record(idx) + + def __len__(self): + return len(self._data) diff --git a/paddleaudio/paddleaudio/features/layers.py b/paddleaudio/paddleaudio/features/layers.py index 4a2c1673a..09037255d 100644 --- a/paddleaudio/paddleaudio/features/layers.py +++ b/paddleaudio/paddleaudio/features/layers.py @@ -17,6 +17,7 @@ from typing import Union import paddle 
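Before moving on to the feature-layer changes below: the `VoxCeleb` dataset completed above is used the same way as `OpenRIRNoise`. A hedged usage sketch (the import path and `target_dir` are assumptions; the first run downloads the full VoxCeleb1 archives, which are tens of GB):

```python
# Usage sketch for the VoxCeleb dataset added above (paths and import are assumptions).
from paddleaudio.datasets import VoxCeleb

train_ds = VoxCeleb(subset='train', target_dir='/data/vox')    # split_ratio=0.9 of files go to train
enroll_ds = VoxCeleb(subset='enroll', target_dir='/data/vox')  # built from veri_test2.txt

sample = train_ds[0]
print(sample['feat'].shape, sample['label'])   # a random 3 s chunk and its encoded speaker label
print(len(train_ds.spk_id2label))              # number of training speakers
```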
 import paddle.nn as nn
+from paddle import Tensor
 
 from ..functional import compute_fbank_matrix
 from ..functional import create_dct
@@ -32,42 +33,34 @@ __all__ = [
 
 
 class Spectrogram(nn.Layer):
+    """Compute spectrogram of given signals, typically audio waveforms.
+    The spectrogram is defined as the complex norm of the short-time Fourier transformation.
+
+    Args:
+        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
+        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
+        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
+        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
+        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
+        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
+    """
+
     def __init__(self,
                  n_fft: int=512,
                  hop_length: Optional[int]=None,
                  win_length: Optional[int]=None,
                  window: str='hann',
+                 power: float=2.0,
                  center: bool=True,
                  pad_mode: str='reflect',
-                 dtype: str=paddle.float32):
-        """Compute spectrogram of a given signal, typically an audio waveform.
-           The spectorgram is defined as the complex norm of the short-time
-           Fourier transformation.
-           Parameters:
-               n_fft (int): the number of frequency components of the discrete Fourier transform.
-                   The default value is 2048,
-               hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4.
-                   The default value is None.
-               win_length: the window length of the short time FFt. If None, it is set to same as n_fft.
-                   The default value is None.
-               window (str): the name of the window function applied to the single before the Fourier transform.
-                   The folllowing window names are supported: 'hamming','hann','kaiser','gaussian',
-                   'exponential','triang','bohman','blackman','cosine','tukey','taylor'.
-                   The default value is 'hann'
-               center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length].
-                   If False, frame t begins at x[t * hop_length]
-                   The default value is True
-               pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect'
-                   and 'constant'. The default value is 'reflect'.
-               dtype (str): the data type of input and window.
-           Notes:
-               The Spectrogram transform relies on STFT transform to compute the spectrogram.
-               By default, the weights are not learnable. To fine-tune the Fourier coefficients,
-               set stop_gradient=False before training.
-               For more information, see STFT().
-        """
+                 dtype: str='float32') -> None:
         super(Spectrogram, self).__init__()
+        assert power > 0, 'Power of spectrogram must be > 0.'
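The `power` argument introduced here generalizes the previously hard-coded squared magnitude: `power=2.0` reproduces the old behaviour, `power=1.0` gives a magnitude spectrogram. A short usage sketch of the updated transforms, assuming they are exported from `paddleaudio.features`:

```python
# Usage sketch for the updated feature layers (export path assumed; shapes for 16 kHz input).
import paddle
from paddleaudio.features import LogMelSpectrogram, Spectrogram

x = paddle.randn([2, 16000])                    # (N, T): a batch of two 1-second waveforms

mag = Spectrogram(n_fft=512, power=1.0)(x)      # magnitude spectrogram, shape (2, 257, num_frames)
pow_spec = Spectrogram(n_fft=512, power=2.0)(x) # power spectrogram (the previous fixed behaviour)

fbank = LogMelSpectrogram(sr=16000, n_fft=512, n_mels=64, power=2.0)(x)  # (2, 64, num_frames)
```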
+ self.power = power + if win_length is None: win_length = n_fft @@ -83,19 +76,46 @@ class Spectrogram(nn.Layer): pad_mode=pad_mode) self.register_buffer('fft_window', self.fft_window) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Spectrograms with shape `(N, n_fft//2 + 1, num_frames)`. + """ stft = self._stft(x) - spectrogram = paddle.square(paddle.abs(stft)) + spectrogram = paddle.pow(paddle.abs(stft), self.power) return spectrogram class MelSpectrogram(nn.Layer): + """Compute the melspectrogram of given signals, typically audio waveforms. It is computed by multiplying spectrogram with Mel filter bank matrix. + + Args: + sr (int, optional): Sample rate. Defaults to 22050. + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False. + norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'. + dtype (str, optional): Data type of input and window. Defaults to 'float32'. + """ + def __init__(self, sr: int=22050, n_fft: int=512, hop_length: Optional[int]=None, win_length: Optional[int]=None, window: str='hann', + power: float=2.0, center: bool=True, pad_mode: str='reflect', n_mels: int=64, @@ -103,38 +123,7 @@ class MelSpectrogram(nn.Layer): f_max: Optional[float]=None, htk: bool=False, norm: Union[str, float]='slaney', - dtype: str=paddle.float32): - """Compute the melspectrogram of a given signal, typically an audio waveform. - The melspectrogram is also known as filterbank or fbank feature in audio community. - It is computed by multiplying spectrogram with Mel filter bank matrix. - Parameters: - sr(int): the audio sample rate. - The default value is 22050. - n_fft(int): the number of frequency components of the discrete Fourier transform. - The default value is 2048, - hop_length(int|None): the hop length of the short time FFT. If None, it is set to win_length//4. - The default value is None. - win_length: the window length of the short time FFt. If None, it is set to same as n_fft. - The default value is None. - window(str): the name of the window function applied to the single before the Fourier transform. 
- The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', - 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. - The default value is 'hann' - center(bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. - If False, frame t begins at x[t * hop_length] - The default value is True - pad_mode(str): the mode to pad the signal if necessary. The supported modes are 'reflect' - and 'constant'. - The default value is 'reflect'. - n_mels(int): the mel bins. - f_min(float): the lower cut-off frequency, below which the filter response is zero. - f_max(float): the upper cut-off frequency, above which the filter response is zeros. - htk(bool): whether to use HTK formula in computing fbank matrix. - norm(str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. - You can specify norm=1.0/2.0 to use customized p-norm normalization. - dtype(str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical - accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. - """ + dtype: str='float32') -> None: super(MelSpectrogram, self).__init__() self._spectrogram = Spectrogram( @@ -142,6 +131,7 @@ class MelSpectrogram(nn.Layer): hop_length=hop_length, win_length=win_length, window=window, + power=power, center=center, pad_mode=pad_mode, dtype=dtype) @@ -163,19 +153,49 @@ class MelSpectrogram(nn.Layer): dtype=dtype) # float64 for better numerical results self.register_buffer('fbank_matrix', self.fbank_matrix) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Mel spectrograms with shape `(N, n_mels, num_frames)`. + """ spect_feature = self._spectrogram(x) mel_feature = paddle.matmul(self.fbank_matrix, spect_feature) return mel_feature class LogMelSpectrogram(nn.Layer): + """Compute log-mel-spectrogram feature of given signals, typically audio waveforms. + + Args: + sr (int, optional): Sample rate. Defaults to 22050. + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False. + norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. 
You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'. + ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10. + top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None. + dtype (str, optional): Data type of input and window. Defaults to 'float32'. + """ + def __init__(self, sr: int=22050, n_fft: int=512, hop_length: Optional[int]=None, win_length: Optional[int]=None, window: str='hann', + power: float=2.0, center: bool=True, pad_mode: str='reflect', n_mels: int=64, @@ -186,44 +206,7 @@ class LogMelSpectrogram(nn.Layer): ref_value: float=1.0, amin: float=1e-10, top_db: Optional[float]=None, - dtype: str=paddle.float32): - """Compute log-mel-spectrogram(also known as LogFBank) feature of a given signal, - typically an audio waveform. - Parameters: - sr (int): the audio sample rate. - The default value is 22050. - n_fft (int): the number of frequency components of the discrete Fourier transform. - The default value is 2048, - hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4. - The default value is None. - win_length: the window length of the short time FFt. If None, it is set to same as n_fft. - The default value is None. - window (str): the name of the window function applied to the single before the Fourier transform. - The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', - 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. - The default value is 'hann' - center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. - If False, frame t begins at x[t * hop_length] - The default value is True - pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect' - and 'constant'. - The default value is 'reflect'. - n_mels (int): the mel bins. - f_min (float): the lower cut-off frequency, below which the filter response is zero. - f_max (float): the upper cut-off frequency, above which the filter response is zeros. - htk (bool): whether to use HTK formula in computing fbank matrix. - norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. - You can specify norm=1.0/2.0 to use customized p-norm normalization. - ref_value (float): the reference value. If smaller than 1.0, the db level - amin (float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly. - Otherwise, the db level is pushed down. - magnitude is clipped(to amin). For numerical stability, set amin to a larger value, - e.g., 1e-3. - top_db (float): the maximum db value of resulting spectrum, above which the - spectrum is clipped(to top_db). - dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical - accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. 
-        """
+                 dtype: str='float32') -> None:
         super(LogMelSpectrogram, self).__init__()
 
         self._melspectrogram = MelSpectrogram(
@@ -232,6 +215,7 @@ class LogMelSpectrogram(nn.Layer):
             hop_length=hop_length,
             win_length=win_length,
             window=window,
+            power=power,
             center=center,
             pad_mode=pad_mode,
             n_mels=n_mels,
@@ -245,8 +229,14 @@ class LogMelSpectrogram(nn.Layer):
         self.amin = amin
         self.top_db = top_db
 
-    def forward(self, x):
-        # import ipdb; ipdb.set_trace()
+    def forward(self, x: Tensor) -> Tensor:
+        """
+        Args:
+            x (Tensor): Tensor of waveforms with shape `(N, T)`
+
+        Returns:
+            Tensor: Log mel spectrograms with shape `(N, n_mels, num_frames)`.
+        """
         mel_feature = self._melspectrogram(x)
         log_mel_feature = power_to_db(
             mel_feature,
@@ -257,6 +247,29 @@
 
 
 class MFCC(nn.Layer):
+    """Compute mel frequency cepstral coefficients (MFCCs) feature of given waveforms.
+
+    Args:
+        sr (int, optional): Sample rate. Defaults to 22050.
+        n_mfcc (int, optional): Number of mel frequency cepstral coefficients. Defaults to 40.
+        n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512.
+        hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None.
+        win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None.
+        window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'.
+        power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0.
+        center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\_length` at the center of `t`-th frame. Defaults to True.
+        pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'.
+        n_mels (int, optional): Number of mel bins. Defaults to 64.
+        f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0.
+        f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None.
+        htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False.
+        norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'.
+        ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0.
+        amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10.
+        top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None.
+        dtype (str, optional): Data type of input and window. Defaults to 'float32'.
+    """
+
     def __init__(self,
                  sr: int=22050,
                  n_mfcc: int=40,
@@ -264,6 +277,7 @@ class MFCC(nn.Layer):
                  hop_length: Optional[int]=None,
                  win_length: Optional[int]=None,
                  window: str='hann',
+                 power: float=2.0,
                  center: bool=True,
                  pad_mode: str='reflect',
                  n_mels: int=64,
@@ -274,45 +288,7 @@ class MFCC(nn.Layer):
                  ref_value: float=1.0,
                  amin: float=1e-10,
                  top_db: Optional[float]=None,
-                 dtype: str=paddle.float32):
-        """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms.
-
-        Parameters:
-            sr(int): the audio sample rate.
-                The default value is 22050.
- n_mfcc (int, optional): Number of cepstra in MFCC. Defaults to 40. - n_fft (int): the number of frequency components of the discrete Fourier transform. - The default value is 2048, - hop_length (int|None): the hop length of the short time FFT. If None, it is set to win_length//4. - The default value is None. - win_length: the window length of the short time FFt. If None, it is set to same as n_fft. - The default value is None. - window (str): the name of the window function applied to the single before the Fourier transform. - The folllowing window names are supported: 'hamming','hann','kaiser','gaussian', - 'exponential','triang','bohman','blackman','cosine','tukey','taylor'. - The default value is 'hann' - center (bool): if True, the signal is padded so that frame t is centered at x[t * hop_length]. - If False, frame t begins at x[t * hop_length] - The default value is True - pad_mode (str): the mode to pad the signal if necessary. The supported modes are 'reflect' - and 'constant'. - The default value is 'reflect'. - n_mels (int): the mel bins. - f_min (float): the lower cut-off frequency, below which the filter response is zero. - f_max (float): the upper cut-off frequency, above which the filter response is zeros. - htk (bool): whether to use HTK formula in computing fbank matrix. - norm (str|float): the normalization type in computing fbank matrix. Slaney-style is used by default. - You can specify norm=1.0/2.0 to use customized p-norm normalization. - ref_value (float): the reference value. If smaller than 1.0, the db level - amin (float): the minimum value of input magnitude, below which the input of the signal will be pulled up accordingly. - Otherwise, the db level is pushed down. - magnitude is clipped(to amin). For numerical stability, set amin to a larger value, - e.g., 1e-3. - top_db (float): the maximum db value of resulting spectrum, above which the - spectrum is clipped(to top_db). - dtype (str): the datatype of fbank matrix used in the transform. Use float64 to increase numerical - accuracy. Note that the final transform will be conducted in float32 regardless of dtype of fbank matrix. - """ + dtype: str=paddle.float32) -> None: super(MFCC, self).__init__() assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( n_mfcc, n_mels) @@ -322,6 +298,7 @@ class MFCC(nn.Layer): hop_length=hop_length, win_length=win_length, window=window, + power=power, center=center, pad_mode=pad_mode, n_mels=n_mels, @@ -336,7 +313,14 @@ class MFCC(nn.Layer): self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype) self.register_buffer('dct_matrix', self.dct_matrix) - def forward(self, x): + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`. 
+ """ log_mel_feature = self._log_melspectrogram(x) mfcc = paddle.matmul( log_mel_feature.transpose((0, 2, 1)), self.dct_matrix).transpose( diff --git a/paddleaudio/paddleaudio/functional/functional.py b/paddleaudio/paddleaudio/functional/functional.py index c5ab30453..19c63a9ae 100644 --- a/paddleaudio/paddleaudio/functional/functional.py +++ b/paddleaudio/paddleaudio/functional/functional.py @@ -17,6 +17,7 @@ from typing import Optional from typing import Union import paddle +from paddle import Tensor __all__ = [ 'hz_to_mel', @@ -29,19 +30,20 @@ __all__ = [ ] -def hz_to_mel(freq: Union[paddle.Tensor, float], - htk: bool=False) -> Union[paddle.Tensor, float]: +def hz_to_mel(freq: Union[Tensor, float], + htk: bool=False) -> Union[Tensor, float]: """Convert Hz to Mels. - Parameters: - freq: the input tensor of arbitrary shape, or a single floating point number. - htk: use HTK formula to do the conversion. - The default value is False. + + Args: + freq (Union[Tensor, float]): The input tensor with arbitrary shape. + htk (bool, optional): Use htk scaling. Defaults to False. + Returns: - The frequencies represented in Mel-scale. + Union[Tensor, float]: Frequency in mels. """ if htk: - if isinstance(freq, paddle.Tensor): + if isinstance(freq, Tensor): return 2595.0 * paddle.log10(1.0 + freq / 700.0) else: return 2595.0 * math.log10(1.0 + freq / 700.0) @@ -58,7 +60,7 @@ def hz_to_mel(freq: Union[paddle.Tensor, float], min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) logstep = math.log(6.4) / 27.0 # step size for log region - if isinstance(freq, paddle.Tensor): + if isinstance(freq, Tensor): target = min_log_mel + paddle.log( freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10 mask = (freq > min_log_hz).astype(freq.dtype) @@ -71,14 +73,16 @@ def hz_to_mel(freq: Union[paddle.Tensor, float], return mels -def mel_to_hz(mel: Union[float, paddle.Tensor], - htk: bool=False) -> Union[float, paddle.Tensor]: +def mel_to_hz(mel: Union[float, Tensor], + htk: bool=False) -> Union[float, Tensor]: """Convert mel bin numbers to frequencies. - Parameters: - mel: the mel frequency represented as a tensor of arbitrary shape, or a floating point number. - htk: use HTK formula to do the conversion. + + Args: + mel (Union[float, Tensor]): The mel frequency represented as a tensor with arbitrary shape. + htk (bool, optional): Use htk scaling. Defaults to False. + Returns: - The frequencies represented in hz. + Union[float, Tensor]: Frequencies in Hz. """ if htk: return 700.0 * (10.0**(mel / 2595.0) - 1.0) @@ -90,7 +94,7 @@ def mel_to_hz(mel: Union[float, paddle.Tensor], min_log_hz = 1000.0 # beginning of log region (Hz) min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) logstep = math.log(6.4) / 27.0 # step size for log region - if isinstance(mel, paddle.Tensor): + if isinstance(mel, Tensor): target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel)) mask = (mel > min_log_mel).astype(mel.dtype) freqs = target * mask + freqs * ( @@ -106,16 +110,18 @@ def mel_frequencies(n_mels: int=64, f_min: float=0.0, f_max: float=11025.0, htk: bool=False, - dtype: str=paddle.float32): + dtype: str='float32') -> Tensor: """Compute mel frequencies. - Parameters: - n_mels(int): number of Mel bins. - f_min(float): the lower cut-off frequency, below which the filter response is zero. - f_max(float): the upper cut-off frequency, above which the filter response is zero. - htk(bool): whether to use htk formula. - dtype(str): the datatype of the return frequencies. 
+ + Args: + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0. + fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0. + htk (bool, optional): Use htk scaling. Defaults to False. + dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'. + Returns: - The frequencies represented in Mel-scale + Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`. """ # 'Center freqs' of mel bands - uniformly spaced between limits min_mel = hz_to_mel(f_min, htk=htk) @@ -125,14 +131,16 @@ def mel_frequencies(n_mels: int=64, return freqs -def fft_frequencies(sr: int, n_fft: int, dtype: str=paddle.float32): +def fft_frequencies(sr: int, n_fft: int, dtype: str='float32') -> Tensor: """Compute fourier frequencies. - Parameters: - sr(int): the audio sample rate. - n_fft(float): the number of fft bins. - dtype(str): the datatype of the return frequencies. + + Args: + sr (int): Sample rate. + n_fft (int): Number of fft bins. + dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'. + Returns: - The frequencies represented in hz. + Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. """ return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) @@ -144,23 +152,21 @@ def compute_fbank_matrix(sr: int, f_max: Optional[float]=None, htk: bool=False, norm: Union[str, float]='slaney', - dtype: str=paddle.float32): + dtype: str='float32') -> Tensor: """Compute fbank matrix. - Parameters: - sr(int): the audio sample rate. - n_fft(int): the number of fft bins. - n_mels(int): the number of Mel bins. - f_min(float): the lower cut-off frequency, below which the filter response is zero. - f_max(float): the upper cut-off frequency, above which the filter response is zero. - htk: whether to use htk formula. - return_complex(bool): whether to return complex matrix. If True, the matrix will - be complex type. Otherwise, the real and image part will be stored in the last - axis of returned tensor. - dtype(str): the datatype of the returned fbank matrix. + + Args: + sr (int): Sample rate. + n_fft (int): Number of fft bins. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use htk scaling. Defaults to False. + norm (Union[str, float], optional): Type of normalization. Defaults to 'slaney'. + dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. + Returns: - The fbank matrix of shape (n_mels, int(1+n_fft//2)). - Shape: - output: (n_mels, int(1+n_fft//2)) + Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. """ if f_max is None: @@ -199,27 +205,20 @@ def compute_fbank_matrix(sr: int, return weights -def power_to_db(magnitude: paddle.Tensor, +def power_to_db(spect: Tensor, ref_value: float=1.0, amin: float=1e-10, - top_db: Optional[float]=None) -> paddle.Tensor: - """Convert a power spectrogram (amplitude squared) to decibel (dB) units. - The function computes the scaling ``10 * log10(x / ref)`` in a numerically - stable way. - Parameters: - magnitude(Tensor): the input magnitude tensor of any shape. - ref_value(float): the reference value. If smaller than 1.0, the db level - of the signal will be pulled up accordingly. Otherwise, the db level - is pushed down. 
- amin(float): the minimum value of input magnitude, below which the input - magnitude is clipped(to amin). - top_db(float): the maximum db value of resulting spectrum, above which the - spectrum is clipped(to top_db). + top_db: Optional[float]=None) -> Tensor: + """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way. + + Args: + spect (Tensor): STFT power spectrogram. + ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): Minimum threshold. Defaults to 1e-10. + top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to None. + Returns: - The spectrogram in log-scale. - shape: - input: any shape - output: same as input + Tensor: Power spectrogram in db scale. """ if amin <= 0: raise Exception("amin must be strictly positive") @@ -227,8 +226,8 @@ def power_to_db(magnitude: paddle.Tensor, if ref_value <= 0: raise Exception("ref_value must be strictly positive") - ones = paddle.ones_like(magnitude) - log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, magnitude)) + ones = paddle.ones_like(spect) + log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, spect)) log_spec -= 10.0 * math.log10(max(ref_value, amin)) if top_db is not None: @@ -242,15 +241,17 @@ def power_to_db(magnitude: paddle.Tensor, def create_dct(n_mfcc: int, n_mels: int, norm: Optional[str]='ortho', - dtype: Optional[str]=paddle.float32) -> paddle.Tensor: + dtype: str='float32') -> Tensor: """Create a discrete cosine transform(DCT) matrix. - Parameters: + Args: n_mfcc (int): Number of mel frequency cepstral coefficients. n_mels (int): Number of mel filterbanks. - norm (str, optional): Normalizaiton type. Defaults to 'ortho'. + norm (Optional[str], optional): Normalizaiton type. Defaults to 'ortho'. + dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. + Returns: - Tensor: The DCT matrix with shape (n_mels, n_mfcc). + Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`. """ n = paddle.arange(n_mels, dtype=dtype) k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1) diff --git a/paddleaudio/paddleaudio/functional/window.py b/paddleaudio/paddleaudio/functional/window.py index f321b38ef..c99d50462 100644 --- a/paddleaudio/paddleaudio/functional/window.py +++ b/paddleaudio/paddleaudio/functional/window.py @@ -20,24 +20,11 @@ from paddle import Tensor __all__ = [ 'get_window', - - # windows - 'taylor', - 'hamming', - 'hann', - 'tukey', - 'kaiser', - 'gaussian', - 'exponential', - 'triang', - 'bohman', - 'blackman', - 'cosine', ] -def _cat(a: List[Tensor], data_type: str) -> Tensor: - l = [paddle.to_tensor(_a, data_type) for _a in a] +def _cat(x: List[Tensor], data_type: str) -> Tensor: + l = [paddle.to_tensor(_, data_type) for _ in x] return paddle.concat(l) @@ -48,7 +35,7 @@ def _acosh(x: Union[Tensor, float]) -> Tensor: def _extend(M: int, sym: bool) -> bool: - """Extend window by 1 sample if needed for DFT-even symmetry""" + """Extend window by 1 sample if needed for DFT-even symmetry. """ if not sym: return M + 1, True else: @@ -56,7 +43,7 @@ def _extend(M: int, sym: bool) -> bool: def _len_guards(M: int) -> bool: - """Handle small or incorrect window lengths""" + """Handle small or incorrect window lengths. 
""" if int(M) != M or M < 0: raise ValueError('Window length M must be a non-negative integer') @@ -64,15 +51,15 @@ def _len_guards(M: int) -> bool: def _truncate(w: Tensor, needed: bool) -> Tensor: - """Truncate window by 1 sample if needed for DFT-even symmetry""" + """Truncate window by 1 sample if needed for DFT-even symmetry. """ if needed: return w[:-1] else: return w -def general_gaussian(M: int, p, sig, sym: bool=True, - dtype: str='float64') -> Tensor: +def _general_gaussian(M: int, p, sig, sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a window with a generalized Gaussian shape. This function is consistent with scipy.signal.windows.general_gaussian(). """ @@ -86,8 +73,8 @@ def general_gaussian(M: int, p, sig, sym: bool=True, return _truncate(w, needs_trunc) -def general_cosine(M: int, a: float, sym: bool=True, - dtype: str='float64') -> Tensor: +def _general_cosine(M: int, a: float, sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a generic weighted sum of cosine terms window. This function is consistent with scipy.signal.windows.general_cosine(). """ @@ -101,31 +88,23 @@ def general_cosine(M: int, a: float, sym: bool=True, return _truncate(w, needs_trunc) -def general_hamming(M: int, alpha: float, sym: bool=True, - dtype: str='float64') -> Tensor: +def _general_hamming(M: int, alpha: float, sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a generalized Hamming window. This function is consistent with scipy.signal.windows.general_hamming() """ - return general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype) + return _general_cosine(M, [alpha, 1. - alpha], sym, dtype=dtype) -def taylor(M: int, - nbar=4, - sll=30, - norm=True, - sym: bool=True, - dtype: str='float64') -> Tensor: +def _taylor(M: int, + nbar=4, + sll=30, + norm=True, + sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a Taylor window. The Taylor window taper function approximates the Dolph-Chebyshev window's constant sidelobe level for a parameterized number of near-in sidelobes. - Parameters: - M(int): window size - nbar, sil, norm: the window-specific parameter. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -171,46 +150,25 @@ def taylor(M: int, return _truncate(w, needs_trunc) -def hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _hamming(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Hamming window. The Hamming window is a taper formed by using a raised cosine with non-zero endpoints, optimized to minimize the nearest side lobe. - Parameters: - M(int): window size - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ - return general_hamming(M, 0.54, sym, dtype=dtype) + return _general_hamming(M, 0.54, sym, dtype=dtype) -def hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _hann(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Hann window. The Hann window is a taper formed by using a raised cosine or sine-squared with ends that touch zero. - Parameters: - M(int): window size - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. 
- Returns: - Tensor: the window tensor """ - return general_hamming(M, 0.5, sym, dtype=dtype) + return _general_hamming(M, 0.5, sym, dtype=dtype) -def tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: +def _tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Tukey window. The Tukey window is also known as a tapered cosine window. - Parameters: - M(int): window size - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -237,32 +195,18 @@ def tukey(M: int, alpha=0.5, sym: bool=True, dtype: str='float64') -> Tensor: return _truncate(w, needs_trunc) -def kaiser(M: int, beta: float, sym: bool=True, dtype: str='float64') -> Tensor: +def _kaiser(M: int, beta: float, sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a Kaiser window. The Kaiser window is a taper formed by using a Bessel function. - Parameters: - M(int): window size. - beta(float): the window-specific parameter. - sym(bool):whether to return symmetric window. - The default value is True - Returns: - Tensor: the window tensor """ raise NotImplementedError() -def gaussian(M: int, std: float, sym: bool=True, - dtype: str='float64') -> Tensor: +def _gaussian(M: int, std: float, sym: bool=True, + dtype: str='float64') -> Tensor: """Compute a Gaussian window. The Gaussian widows has a Gaussian shape defined by the standard deviation(std). - Parameters: - M(int): window size. - std(float): the window-specific parameter. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -275,21 +219,12 @@ def gaussian(M: int, std: float, sym: bool=True, return _truncate(w, needs_trunc) -def exponential(M: int, - center=None, - tau=1., - sym: bool=True, - dtype: str='float64') -> Tensor: - """Compute an exponential (or Poisson) window. - Parameters: - M(int): window size. - tau(float): the window-specific parameter. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor - """ +def _exponential(M: int, + center=None, + tau=1., + sym: bool=True, + dtype: str='float64') -> Tensor: + """Compute an exponential (or Poisson) window. """ if sym and center is not None: raise ValueError("If sym==True, center must be None.") if _len_guards(M): @@ -305,15 +240,8 @@ def exponential(M: int, return _truncate(w, needs_trunc) -def triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a triangular window. - Parameters: - M(int): window size. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -330,16 +258,9 @@ def triang(M: int, sym: bool=True, dtype: str='float64') -> Tensor: return _truncate(w, needs_trunc) -def bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Bohman window. The Bohman window is the autocorrelation of a cosine window. - Parameters: - M(int): window size. 
- sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -353,32 +274,18 @@ def bohman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: return _truncate(w, needs_trunc) -def blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _blackman(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a Blackman window. The Blackman window is a taper formed by using the first three terms of a summation of cosines. It was designed to have close to the minimal leakage possible. It is close to optimal, only slightly worse than a Kaiser window. - Parameters: - M(int): window size. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ - return general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype) + return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype) -def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: +def _cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: """Compute a window with a simple cosine shape. - Parameters: - M(int): window size. - sym(bool):whether to return symmetric window. - The default value is True - dtype(str): the datatype of returned tensor. - Returns: - Tensor: the window tensor """ if _len_guards(M): return paddle.ones((M, ), dtype=dtype) @@ -388,19 +295,20 @@ def cosine(M: int, sym: bool=True, dtype: str='float64') -> Tensor: return _truncate(w, needs_trunc) -## factory function def get_window(window: Union[str, Tuple[str, float]], win_length: int, fftbins: bool=True, dtype: str='float64') -> Tensor: """Return a window of a given length and type. - Parameters: - window(str|(str,float)): the type of window to create. - win_length(int): the number of samples in the window. - fftbins(bool): If True, create a "periodic" window. Otherwise, - create a "symmetric" window, for use in filter design. + + Args: + window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. + win_length (int): Number of samples. + fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True. + dtype (str, optional): The data type of the return window. Defaults to 'float64'. + Returns: - The window represented as a tensor. + Tensor: The window represented as a tensor. """ sym = not fftbins @@ -420,7 +328,7 @@ def get_window(window: Union[str, Tuple[str, float]], str(type(window))) try: - winfunc = eval(winstr) + winfunc = eval('_' + winstr) except KeyError as e: raise ValueError("Unknown window type.") from e diff --git a/paddleaudio/paddleaudio/metric/__init__.py b/paddleaudio/paddleaudio/metric/__init__.py index a96530ff6..d2b3a1360 100644 --- a/paddleaudio/paddleaudio/metric/__init__.py +++ b/paddleaudio/paddleaudio/metric/__init__.py @@ -12,4 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. 
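With the concrete window functions now private helpers, the `get_window` factory in `functional/window.py` above is the single entry point; parameterized windows are requested as a `(name, parameter)` tuple, matching the `Union[str, Tuple[str, float]]` signature. A short sketch (module path taken from this diff):

```python
# Usage sketch for the get_window factory defined above (module path as laid out in this diff).
from paddleaudio.functional.window import get_window

hann = get_window('hann', 512)                        # periodic window for STFT, shape (512,)
hamming = get_window('hamming', 400, fftbins=False)   # symmetric variant, e.g. for filter design
gauss = get_window(('gaussian', 7.0), 512)            # parameterized windows use a (name, param) tuple

print(hann.shape, hann.dtype)                         # [512], float64 by default
```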
 from .dtw import dtw_distance
-from .mcd import mcd_distance
+from .eer import compute_eer
+from .eer import compute_minDCF
diff --git a/paddleaudio/paddleaudio/metric/dtw.py b/paddleaudio/paddleaudio/metric/dtw.py
index d27f56e28..662e4506d 100644
--- a/paddleaudio/paddleaudio/metric/dtw.py
+++ b/paddleaudio/paddleaudio/metric/dtw.py
@@ -20,17 +20,19 @@ __all__ = [
 
 
 def dtw_distance(xs: np.ndarray, ys: np.ndarray) -> float:
-    """dtw distance
-
-    Dynamic Time Warping.
+    """Dynamic Time Warping.
     This function keeps a compact matrix, not the full warping paths matrix.
     Uses dynamic programming to compute:
 
-        wps[i, j] = (s1[i]-s2[j])**2 + min(
-                        wps[i-1, j  ] + penalty,  // vertical   / insertion / expansion
-                        wps[i  , j-1] + penalty,  // horizontal / deletion  / compression
-                        wps[i-1, j-1])            // diagonal   / match
-        dtw = sqrt(wps[-1, -1])
+    Examples:
+        .. code-block:: python
+
+            wps[i, j] = (s1[i]-s2[j])**2 + min(
+                            wps[i-1, j  ] + penalty,  // vertical   / insertion / expansion
+                            wps[i  , j-1] + penalty,  // horizontal / deletion  / compression
+                            wps[i-1, j-1])            // diagonal   / match
+
+            dtw = sqrt(wps[-1, -1])
 
     Args:
         xs (np.ndarray): ref sequence, [T,D]
diff --git a/paddleaudio/paddleaudio/metric/eer.py b/paddleaudio/paddleaudio/metric/eer.py
new file mode 100644
index 000000000..a1166d3f9
--- /dev/null
+++ b/paddleaudio/paddleaudio/metric/eer.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+
+import numpy as np
+import paddle
+from sklearn.metrics import roc_curve
+
+
+def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]:
+    """Compute EER and return score threshold.
+
+    Args:
+        labels (np.ndarray): the trial labels, shape: [N], one-dimensional, where N is the number of trials
+        scores (np.ndarray): the trial scores, shape: [N], one-dimensional, where N is the number of trials
+
+    Returns:
+        List[float]: the EER and the corresponding threshold
+    """
+    fpr, tpr, threshold = roc_curve(y_true=labels, y_score=scores)
+    fnr = 1 - tpr
+    eer_threshold = threshold[np.nanargmin(np.absolute((fnr - fpr)))]
+    eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
+    return eer, eer_threshold
+
+
+def compute_minDCF(positive_scores,
+                   negative_scores,
+                   c_miss=1.0,
+                   c_fa=1.0,
+                   p_target=0.01):
+    """
+    This is modified from SpeechBrain
+    https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/utils/metric_stats.py#L509
+    Computes the minDCF metric normally used to evaluate speaker verification
+    systems. The min_DCF is the minimum of the following C_det function computed
+    within the defined threshold range:
+
+    C_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target)
+
+    where p_miss is the miss probability and p_fa is the probability of having
+    a false alarm.
+
+    Args:
+        positive_scores (paddle.Tensor): The scores from entries of the same class.
+        negative_scores (paddle.Tensor): The scores from entries of different classes.
+ c_miss (float, optional): Cost assigned to a missing error (default 1.0). + c_fa (float, optional): Cost assigned to a false alarm (default 1.0). + p_target (float, optional): Prior probability of having a target (default 0.01). + + Returns: + List[float]: min dcf and the specific threshold + """ + # Computing candidate thresholds + if len(positive_scores.shape) > 1: + positive_scores = positive_scores.squeeze() + + if len(negative_scores.shape) > 1: + negative_scores = negative_scores.squeeze() + + thresholds = paddle.sort(paddle.concat([positive_scores, negative_scores])) + thresholds = paddle.unique(thresholds) + + # Adding intermediate thresholds + interm_thresholds = (thresholds[0:-1] + thresholds[1:]) / 2 + thresholds = paddle.sort(paddle.concat([thresholds, interm_thresholds])) + + # Computing False Rejection Rate (miss detection) + positive_scores = paddle.concat( + len(thresholds) * [positive_scores.unsqueeze(0)]) + pos_scores_threshold = positive_scores.transpose(perm=[1, 0]) <= thresholds + p_miss = (pos_scores_threshold.sum(0) + ).astype("float32") / positive_scores.shape[1] + del positive_scores + del pos_scores_threshold + + # Computing False Acceptance Rate (false alarm) + negative_scores = paddle.concat( + len(thresholds) * [negative_scores.unsqueeze(0)]) + neg_scores_threshold = negative_scores.transpose(perm=[1, 0]) > thresholds + p_fa = (neg_scores_threshold.sum(0) + ).astype("float32") / negative_scores.shape[1] + del negative_scores + del neg_scores_threshold + + c_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target) + c_min = paddle.min(c_det, axis=0) + min_index = paddle.argmin(c_det, axis=0) + return float(c_min), float(thresholds[min_index]) diff --git a/paddleaudio/paddleaudio/metric/mcd.py b/paddleaudio/paddleaudio/metric/mcd.py deleted file mode 100644 index 465cd5a45..000000000 --- a/paddleaudio/paddleaudio/metric/mcd.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import mcd.metrics_fast as mt -import numpy as np -from mcd import dtw - -__all__ = [ - 'mcd_distance', -] - - -def mcd_distance(xs: np.ndarray, ys: np.ndarray, cost_fn=mt.logSpecDbDist): - """Mel cepstral distortion (MCD), dtw distance. - - Dynamic Time Warping. 
- Uses dynamic programming to compute: - wps[i, j] = cost_fn(xs[i], ys[j]) + min( - wps[i-1, j ], // vertical / insertion / expansion - wps[i , j-1], // horizontal / deletion / compression - wps[i-1, j-1]) // diagonal / match - dtw = sqrt(wps[-1, -1]) - - Cost Function: - logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0) - def logSpecDbDist(x, y): - diff = x - y - return logSpecDbConst * math.sqrt(np.inner(diff, diff)) - - Args: - xs (np.ndarray): ref sequence, [T,D] - ys (np.ndarray): hyp sequence, [T,D] - - Returns: - float: dtw distance - """ - min_cost, path = dtw.dtw(xs, ys, cost_fn) - return min_cost diff --git a/paddleaudio/paddleaudio/utils/download.py b/paddleaudio/paddleaudio/utils/download.py index 4658352f9..07d5eea84 100644 --- a/paddleaudio/paddleaudio/utils/download.py +++ b/paddleaudio/paddleaudio/utils/download.py @@ -37,7 +37,9 @@ def decompress(file: str): download._decompress(file) -def download_and_decompress(archives: List[Dict[str, str]], path: str): +def download_and_decompress(archives: List[Dict[str, str]], + path: str, + decompress: bool=True): """ Download archieves and decompress to specific path. """ @@ -47,8 +49,8 @@ def download_and_decompress(archives: List[Dict[str, str]], path: str): for archive in archives: assert 'url' in archive and 'md5' in archive, \ 'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}' - - download.get_path_from_url(archive['url'], path, archive['md5']) + download.get_path_from_url( + archive['url'], path, archive['md5'], decompress=decompress) def load_state_dict_from_url(url: str, path: str, md5: str=None): diff --git a/paddleaudio/setup.py b/paddleaudio/setup.py index 7623443a6..c92e5c73f 100644 --- a/paddleaudio/setup.py +++ b/paddleaudio/setup.py @@ -11,19 +11,46 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
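As a usage note for the metric helpers added above, here is a hedged sketch of how `compute_eer` and `compute_minDCF` might be called on a speaker-verification trial list; the label and score arrays are toy values, not part of the patch:

```python
import numpy as np
import paddle

from paddleaudio.metric import compute_eer, compute_minDCF

# Toy trial list: 1 = target (same speaker), 0 = non-target.
labels = np.array([1, 1, 0, 1, 0, 0, 1, 0])
scores = np.array([0.91, 0.73, 0.35, 0.68, 0.51, 0.12, 0.88, 0.44])

# Equal error rate and the threshold at which it occurs.
eer, eer_threshold = compute_eer(labels, scores)
print(f"EER: {eer:.4f} @ threshold {eer_threshold:.4f}")

# compute_minDCF takes the target and non-target scores as separate tensors.
positive_scores = paddle.to_tensor(scores[labels == 1])
negative_scores = paddle.to_tensor(scores[labels == 0])
min_dcf, dcf_threshold = compute_minDCF(
    positive_scores, negative_scores, p_target=0.01)
print(f"minDCF: {min_dcf:.4f} @ threshold {dcf_threshold:.4f}")
```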
+import glob +import os + import setuptools +from setuptools.command.install import install +from setuptools.command.test import test # set the version here -VERSION = '0.2.0' +VERSION = '0.2.1' + + +# Inspired by the example at https://pytest.org/latest/goodpractises.html +class TestCommand(test): + def finalize_options(self): + test.finalize_options(self) + self.test_args = [] + self.test_suite = True + + def run(self): + self.run_benchmark() + super(TestCommand, self).run() + + def run_tests(self): + # Run nose ensuring that argv simulates running nosetests directly + import nose + nose.run_exit(argv=['nosetests', '-w', 'tests']) + + def run_benchmark(self): + for benchmark_item in glob.glob('tests/benchmark/*py'): + os.system(f'pytest {benchmark_item}') + + +class InstallCommand(install): + def run(self): + install.run(self) def write_version_py(filename='paddleaudio/__init__.py'): - import paddleaudio - if hasattr(paddleaudio, - "__version__") and paddleaudio.__version__ == VERSION: - return with open(filename, "a") as f: - f.write(f"\n__version__ = '{VERSION}'\n") + f.write(f"__version__ = '{VERSION}'") def remove_version_py(filename='paddleaudio/__init__.py'): @@ -35,6 +62,7 @@ def remove_version_py(filename='paddleaudio/__init__.py'): f.write(line) +remove_version_py() write_version_py() setuptools.setup( @@ -54,13 +82,18 @@ setuptools.setup( ], python_requires='>=3.6', install_requires=[ - 'numpy >= 1.15.0', - 'scipy >= 1.0.0', - 'resampy >= 0.2.2', - 'soundfile >= 0.9.0', - 'colorlog', - 'dtaidistance >= 2.3.6', - 'mcd >= 0.4', - ], ) + 'numpy >= 1.15.0', 'scipy >= 1.0.0', 'resampy >= 0.2.2', + 'soundfile >= 0.9.0', 'colorlog', 'dtaidistance == 2.3.1', 'pathos' + ], + extras_require={ + 'test': [ + 'nose', 'librosa==0.8.1', 'soundfile==0.10.3.post1', + 'torchaudio==0.10.2', 'pytest-benchmark' + ], + }, + cmdclass={ + 'install': InstallCommand, + 'test': TestCommand, + }, ) remove_version_py() diff --git a/paddleaudio/tests/backends/__init__.py b/paddleaudio/tests/backends/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddleaudio/tests/backends/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddleaudio/tests/backends/base.py b/paddleaudio/tests/backends/base.py new file mode 100644 index 000000000..a67191887 --- /dev/null +++ b/paddleaudio/tests/backends/base.py @@ -0,0 +1,34 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest +import urllib.request + +mono_channel_wav = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' +multi_channels_wav = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav' + + +class BackendTest(unittest.TestCase): + def setUp(self): + self.initWavInput() + + def initWavInput(self): + self.files = [] + for url in [mono_channel_wav, multi_channels_wav]: + if not os.path.isfile(os.path.basename(url)): + urllib.request.urlretrieve(url, os.path.basename(url)) + self.files.append(os.path.basename(url)) + + def initParmas(self): + raise NotImplementedError diff --git a/paddleaudio/tests/backends/soundfile/__init__.py b/paddleaudio/tests/backends/soundfile/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddleaudio/tests/backends/soundfile/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddleaudio/tests/backends/soundfile/test_io.py b/paddleaudio/tests/backends/soundfile/test_io.py new file mode 100644 index 000000000..0f7580a40 --- /dev/null +++ b/paddleaudio/tests/backends/soundfile/test_io.py @@ -0,0 +1,73 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import filecmp +import os +import unittest + +import numpy as np +import soundfile as sf + +import paddleaudio +from ..base import BackendTest + + +class TestIO(BackendTest): + def test_load_mono_channel(self): + sf_data, sf_sr = sf.read(self.files[0]) + pa_data, pa_sr = paddleaudio.load( + self.files[0], normal=False, dtype='float64') + + self.assertEqual(sf_data.dtype, pa_data.dtype) + self.assertEqual(sf_sr, pa_sr) + np.testing.assert_array_almost_equal(sf_data, pa_data) + + def test_load_multi_channels(self): + sf_data, sf_sr = sf.read(self.files[1]) + sf_data = sf_data.T # Channel dim first + pa_data, pa_sr = paddleaudio.load( + self.files[1], mono=False, normal=False, dtype='float64') + + self.assertEqual(sf_data.dtype, pa_data.dtype) + self.assertEqual(sf_sr, pa_sr) + np.testing.assert_array_almost_equal(sf_data, pa_data) + + def test_save_mono_channel(self): + waveform, sr = np.random.randint( + low=-32768, high=32768, size=(48000), dtype=np.int16), 16000 + sf_tmp_file = 'sf_tmp.wav' + pa_tmp_file = 'pa_tmp.wav' + + sf.write(sf_tmp_file, waveform, sr) + paddleaudio.save(waveform, sr, pa_tmp_file) + + self.assertTrue(filecmp.cmp(sf_tmp_file, pa_tmp_file)) + for file in [sf_tmp_file, pa_tmp_file]: + os.remove(file) + + def test_save_multi_channels(self): + waveform, sr = np.random.randint( + low=-32768, high=32768, size=(2, 48000), dtype=np.int16), 16000 + sf_tmp_file = 'sf_tmp.wav' + pa_tmp_file = 'pa_tmp.wav' + + sf.write(sf_tmp_file, waveform.T, sr) + paddleaudio.save(waveform.T, sr, pa_tmp_file) + + self.assertTrue(filecmp.cmp(sf_tmp_file, pa_tmp_file)) + for file in [sf_tmp_file, pa_tmp_file]: + os.remove(file) + + +if __name__ == '__main__': + unittest.main() diff --git a/paddleaudio/tests/benchmark/README.md b/paddleaudio/tests/benchmark/README.md new file mode 100644 index 000000000..b9034100d --- /dev/null +++ b/paddleaudio/tests/benchmark/README.md @@ -0,0 +1,39 @@ +# 1. Prepare +First, install `pytest-benchmark` via pip. +```sh +pip install pytest-benchmark +``` + +# 2. Run +Run the specific script for profiling. +```sh +pytest melspectrogram.py +``` + +Result: +```sh +========================================================================== test session starts ========================================================================== +platform linux -- Python 3.7.7, pytest-7.0.1, pluggy-1.0.0 +benchmark: 3.4.1 (defaults: timer=time.perf_counter disable_gc=False min_rounds=5 min_time=0.000005 max_time=1.0 calibration_precision=10 warmup=False warmup_iterations=100000) +rootdir: /ssd3/chenxiaojie06/PaddleSpeech/DeepSpeech/paddleaudio +plugins: typeguard-2.12.1, benchmark-3.4.1, anyio-3.5.0 +collected 4 items + +melspectrogram.py .... 
[100%] + + +-------------------------------------------------------------------------------------------------- benchmark: 4 tests ------------------------------------------------------------------------------------------------- +Name (time in us) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +test_melspect_gpu_torchaudio 202.0765 (1.0) 360.6230 (1.0) 218.1168 (1.0) 16.3022 (1.0) 214.2871 (1.0) 21.8451 (1.0) 40;3 4,584.7001 (1.0) 286 1 +test_melspect_gpu 657.8509 (3.26) 908.0470 (2.52) 724.2545 (3.32) 106.5771 (6.54) 669.9096 (3.13) 113.4719 (5.19) 1;0 1,380.7300 (0.30) 5 1 +test_melspect_cpu_torchaudio 1,247.6053 (6.17) 2,892.5799 (8.02) 1,443.2853 (6.62) 345.3732 (21.19) 1,262.7263 (5.89) 221.6385 (10.15) 56;53 692.8637 (0.15) 399 1 +test_melspect_cpu 20,326.2549 (100.59) 20,607.8682 (57.15) 20,473.4125 (93.86) 63.8654 (3.92) 20,467.0429 (95.51) 68.4294 (3.13) 8;1 48.8438 (0.01) 29 1 +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + +Legend: + Outliers: 1 Standard Deviation from Mean; 1.5 IQR (InterQuartile Range) from 1st Quartile and 3rd Quartile. + OPS: Operations Per Second, computed as 1 / Mean +========================================================================== 4 passed in 21.12s =========================================================================== + +``` diff --git a/paddleaudio/tests/benchmark/log_melspectrogram.py b/paddleaudio/tests/benchmark/log_melspectrogram.py new file mode 100644 index 000000000..5230acd42 --- /dev/null +++ b/paddleaudio/tests/benchmark/log_melspectrogram.py @@ -0,0 +1,124 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
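Each benchmark script in this directory follows the same pytest-benchmark pattern: build a feature extractor once, wrap the forward pass in a small function, and let the `benchmark` fixture time it before checking the result against a reference. A stripped-down sketch of that pattern, using random audio instead of the downloaded wav (file name and feature settings are illustrative):

```python
# minimal_benchmark.py -- run with: pytest minimal_benchmark.py
import numpy as np
import paddle

import paddleaudio

# One second of random 16 kHz audio stands in for the test wav.
waveform = np.random.uniform(-1.0, 1.0, size=(16000, )).astype('float32')
waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0)

mel_extractor = paddleaudio.features.MelSpectrogram(
    sr=16000,
    n_fft=512,
    hop_length=128,
    n_mels=40,
    f_min=0.0,
    dtype=waveform_tensor.dtype)


def melspectrogram():
    return mel_extractor(waveform_tensor).squeeze(0)


def test_melspect_cpu(benchmark):
    paddle.set_device('cpu')
    # The benchmark fixture calls the function repeatedly and records timings.
    feature = benchmark(melspectrogram)
    assert feature.shape[0] == 40  # (n_mels, n_frames)
```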
+import os +import urllib.request + +import librosa +import numpy as np +import paddle +import torch +import torchaudio + +import paddleaudio + +wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' +if not os.path.isfile(os.path.basename(wav_url)): + urllib.request.urlretrieve(wav_url, os.path.basename(wav_url)) + +waveform, sr = paddleaudio.load(os.path.abspath(os.path.basename(wav_url))) +waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) +waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0) + +# Feature conf +mel_conf = { + 'sr': sr, + 'n_fft': 512, + 'hop_length': 128, + 'n_mels': 40, +} + +mel_conf_torchaudio = { + 'sample_rate': sr, + 'n_fft': 512, + 'hop_length': 128, + 'n_mels': 40, + 'norm': 'slaney', + 'mel_scale': 'slaney', +} + + +def enable_cpu_device(): + paddle.set_device('cpu') + + +def enable_gpu_device(): + paddle.set_device('gpu') + + +log_mel_extractor = paddleaudio.features.LogMelSpectrogram( + **mel_conf, f_min=0.0, top_db=80.0, dtype=waveform_tensor.dtype) + + +def log_melspectrogram(): + return log_mel_extractor(waveform_tensor).squeeze(0) + + +def test_log_melspect_cpu(benchmark): + enable_cpu_device() + feature_paddleaudio = benchmark(log_melspectrogram) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=3) + + +def test_log_melspect_gpu(benchmark): + enable_gpu_device() + feature_paddleaudio = benchmark(log_melspectrogram) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=2) + + +mel_extractor_torchaudio = torchaudio.transforms.MelSpectrogram( + **mel_conf_torchaudio, f_min=0.0) +amplitude_to_DB = torchaudio.transforms.AmplitudeToDB('power', top_db=80.0) + + +def melspectrogram_torchaudio(): + return mel_extractor_torchaudio(waveform_tensor_torch).squeeze(0) + + +def log_melspectrogram_torchaudio(): + mel_specgram = mel_extractor_torchaudio(waveform_tensor_torch) + return amplitude_to_DB(mel_specgram).squeeze(0) + + +def test_log_melspect_cpu_torchaudio(benchmark): + global waveform_tensor_torch, mel_extractor_torchaudio, amplitude_to_DB + + mel_extractor_torchaudio = mel_extractor_torchaudio.to('cpu') + waveform_tensor_torch = waveform_tensor_torch.to('cpu') + amplitude_to_DB = amplitude_to_DB.to('cpu') + + feature_paddleaudio = benchmark(log_melspectrogram_torchaudio) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=3) + + +def test_log_melspect_gpu_torchaudio(benchmark): + global waveform_tensor_torch, mel_extractor_torchaudio, amplitude_to_DB + + mel_extractor_torchaudio = mel_extractor_torchaudio.to('cuda') + waveform_tensor_torch = waveform_tensor_torch.to('cuda') + amplitude_to_DB = amplitude_to_DB.to('cuda') + + feature_torchaudio = benchmark(log_melspectrogram_torchaudio) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + feature_librosa = librosa.power_to_db(feature_librosa, top_db=80.0) + np.testing.assert_array_almost_equal( + feature_librosa, feature_torchaudio.cpu(), decimal=2) diff --git a/paddleaudio/tests/benchmark/melspectrogram.py 
b/paddleaudio/tests/benchmark/melspectrogram.py new file mode 100644 index 000000000..e0b79b45a --- /dev/null +++ b/paddleaudio/tests/benchmark/melspectrogram.py @@ -0,0 +1,108 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import urllib.request + +import librosa +import numpy as np +import paddle +import torch +import torchaudio + +import paddleaudio + +wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' +if not os.path.isfile(os.path.basename(wav_url)): + urllib.request.urlretrieve(wav_url, os.path.basename(wav_url)) + +waveform, sr = paddleaudio.load(os.path.abspath(os.path.basename(wav_url))) +waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) +waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0) + +# Feature conf +mel_conf = { + 'sr': sr, + 'n_fft': 512, + 'hop_length': 128, + 'n_mels': 40, +} + +mel_conf_torchaudio = { + 'sample_rate': sr, + 'n_fft': 512, + 'hop_length': 128, + 'n_mels': 40, + 'norm': 'slaney', + 'mel_scale': 'slaney', +} + + +def enable_cpu_device(): + paddle.set_device('cpu') + + +def enable_gpu_device(): + paddle.set_device('gpu') + + +mel_extractor = paddleaudio.features.MelSpectrogram( + **mel_conf, f_min=0.0, dtype=waveform_tensor.dtype) + + +def melspectrogram(): + return mel_extractor(waveform_tensor).squeeze(0) + + +def test_melspect_cpu(benchmark): + enable_cpu_device() + feature_paddleaudio = benchmark(melspectrogram) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=3) + + +def test_melspect_gpu(benchmark): + enable_gpu_device() + feature_paddleaudio = benchmark(melspectrogram) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=3) + + +mel_extractor_torchaudio = torchaudio.transforms.MelSpectrogram( + **mel_conf_torchaudio, f_min=0.0) + + +def melspectrogram_torchaudio(): + return mel_extractor_torchaudio(waveform_tensor_torch).squeeze(0) + + +def test_melspect_cpu_torchaudio(benchmark): + global waveform_tensor_torch, mel_extractor_torchaudio + mel_extractor_torchaudio = mel_extractor_torchaudio.to('cpu') + waveform_tensor_torch = waveform_tensor_torch.to('cpu') + feature_paddleaudio = benchmark(melspectrogram_torchaudio) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=3) + + +def test_melspect_gpu_torchaudio(benchmark): + global waveform_tensor_torch, mel_extractor_torchaudio + mel_extractor_torchaudio = mel_extractor_torchaudio.to('cuda') + waveform_tensor_torch = waveform_tensor_torch.to('cuda') + feature_torchaudio = benchmark(melspectrogram_torchaudio) + feature_librosa = librosa.feature.melspectrogram(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_torchaudio.cpu(), 
decimal=3) diff --git a/paddleaudio/tests/benchmark/mfcc.py b/paddleaudio/tests/benchmark/mfcc.py new file mode 100644 index 000000000..2572ff33d --- /dev/null +++ b/paddleaudio/tests/benchmark/mfcc.py @@ -0,0 +1,122 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import urllib.request + +import librosa +import numpy as np +import paddle +import torch +import torchaudio + +import paddleaudio + +wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' +if not os.path.isfile(os.path.basename(wav_url)): + urllib.request.urlretrieve(wav_url, os.path.basename(wav_url)) + +waveform, sr = paddleaudio.load(os.path.abspath(os.path.basename(wav_url))) +waveform_tensor = paddle.to_tensor(waveform).unsqueeze(0) +waveform_tensor_torch = torch.from_numpy(waveform).unsqueeze(0) + +# Feature conf +mel_conf = { + 'sr': sr, + 'n_fft': 512, + 'hop_length': 128, + 'n_mels': 40, +} +mfcc_conf = { + 'n_mfcc': 20, + 'top_db': 80.0, +} +mfcc_conf.update(mel_conf) + +mel_conf_torchaudio = { + 'sample_rate': sr, + 'n_fft': 512, + 'hop_length': 128, + 'n_mels': 40, + 'norm': 'slaney', + 'mel_scale': 'slaney', +} +mfcc_conf_torchaudio = { + 'sample_rate': sr, + 'n_mfcc': 20, +} + + +def enable_cpu_device(): + paddle.set_device('cpu') + + +def enable_gpu_device(): + paddle.set_device('gpu') + + +mfcc_extractor = paddleaudio.features.MFCC( + **mfcc_conf, f_min=0.0, dtype=waveform_tensor.dtype) + + +def mfcc(): + return mfcc_extractor(waveform_tensor).squeeze(0) + + +def test_mfcc_cpu(benchmark): + enable_cpu_device() + feature_paddleaudio = benchmark(mfcc) + feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=3) + + +def test_mfcc_gpu(benchmark): + enable_gpu_device() + feature_paddleaudio = benchmark(mfcc) + feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=3) + + +del mel_conf_torchaudio['sample_rate'] +mfcc_extractor_torchaudio = torchaudio.transforms.MFCC( + **mfcc_conf_torchaudio, melkwargs=mel_conf_torchaudio) + + +def mfcc_torchaudio(): + return mfcc_extractor_torchaudio(waveform_tensor_torch).squeeze(0) + + +def test_mfcc_cpu_torchaudio(benchmark): + global waveform_tensor_torch, mfcc_extractor_torchaudio + + mel_extractor_torchaudio = mfcc_extractor_torchaudio.to('cpu') + waveform_tensor_torch = waveform_tensor_torch.to('cpu') + + feature_paddleaudio = benchmark(mfcc_torchaudio) + feature_librosa = librosa.feature.mfcc(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddleaudio, decimal=3) + + +def test_mfcc_gpu_torchaudio(benchmark): + global waveform_tensor_torch, mfcc_extractor_torchaudio + + mel_extractor_torchaudio = mfcc_extractor_torchaudio.to('cuda') + waveform_tensor_torch = waveform_tensor_torch.to('cuda') + + feature_torchaudio = benchmark(mfcc_torchaudio) + feature_librosa = 
librosa.feature.mfcc(waveform, **mel_conf) + np.testing.assert_array_almost_equal( + feature_librosa, feature_torchaudio.cpu(), decimal=3) diff --git a/paddleaudio/tests/features/__init__.py b/paddleaudio/tests/features/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddleaudio/tests/features/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddleaudio/tests/features/base.py b/paddleaudio/tests/features/base.py new file mode 100644 index 000000000..725e1e2e7 --- /dev/null +++ b/paddleaudio/tests/features/base.py @@ -0,0 +1,49 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os +import unittest +import urllib.request + +import numpy as np +import paddle + +from paddleaudio import load + +wav_url = 'https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav' + + +class FeatTest(unittest.TestCase): + def setUp(self): + self.initParmas() + self.initWavInput() + self.setUpDevice() + + def setUpDevice(self, device='cpu'): + paddle.set_device(device) + + def initWavInput(self, url=wav_url): + if not os.path.isfile(os.path.basename(url)): + urllib.request.urlretrieve(url, os.path.basename(url)) + self.waveform, self.sr = load(os.path.abspath(os.path.basename(url))) + self.waveform = self.waveform.astype( + np.float32 + ) # paddlespeech.s2t.transform.spectrogram only supports float32 + dim = len(self.waveform.shape) + + assert dim in [1, 2] + if dim == 1: + self.waveform = np.expand_dims(self.waveform, 0) + + def initParmas(self): + raise NotImplementedError diff --git a/paddleaudio/tests/features/test_istft.py b/paddleaudio/tests/features/test_istft.py new file mode 100644 index 000000000..23371200b --- /dev/null +++ b/paddleaudio/tests/features/test_istft.py @@ -0,0 +1,49 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import unittest + +import numpy as np +import paddle + +from .base import FeatTest +from paddleaudio.functional.window import get_window +from paddlespeech.s2t.transform.spectrogram import IStft +from paddlespeech.s2t.transform.spectrogram import Stft + + +class TestIstft(FeatTest): + def initParmas(self): + self.n_fft = 512 + self.hop_length = 128 + self.window_str = 'hann' + + def test_istft(self): + ps_stft = Stft(self.n_fft, self.hop_length) + ps_res = ps_stft( + self.waveform.T).squeeze(1).T # (n_fft//2 + 1, n_frmaes) + x = paddle.to_tensor(ps_res) + + ps_istft = IStft(self.hop_length) + ps_res = ps_istft(ps_res.T) + + window = get_window( + self.window_str, self.n_fft, dtype=self.waveform.dtype) + pd_res = paddle.signal.istft( + x, self.n_fft, self.hop_length, window=window) + + np.testing.assert_array_almost_equal(ps_res, pd_res, decimal=5) + + +if __name__ == '__main__': + unittest.main() diff --git a/paddleaudio/tests/features/test_kaldi.py b/paddleaudio/tests/features/test_kaldi.py new file mode 100644 index 000000000..6e826aaa7 --- /dev/null +++ b/paddleaudio/tests/features/test_kaldi.py @@ -0,0 +1,81 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import numpy as np +import paddle +import torch +import torchaudio + +import paddleaudio +from .base import FeatTest + + +class TestKaldi(FeatTest): + def initParmas(self): + self.window_size = 1024 + self.dtype = 'float32' + + def test_window(self): + t_hann_window = torch.hann_window( + self.window_size, periodic=False, dtype=eval(f'torch.{self.dtype}')) + t_hamm_window = torch.hamming_window( + self.window_size, + periodic=False, + alpha=0.54, + beta=0.46, + dtype=eval(f'torch.{self.dtype}')) + t_povey_window = torch.hann_window( + self.window_size, periodic=False, + dtype=eval(f'torch.{self.dtype}')).pow(0.85) + + p_hann_window = paddleaudio.functional.window.get_window( + 'hann', + self.window_size, + fftbins=False, + dtype=eval(f'paddle.{self.dtype}')) + p_hamm_window = paddleaudio.functional.window.get_window( + 'hamming', + self.window_size, + fftbins=False, + dtype=eval(f'paddle.{self.dtype}')) + p_povey_window = paddleaudio.functional.window.get_window( + 'hann', + self.window_size, + fftbins=False, + dtype=eval(f'paddle.{self.dtype}')).pow(0.85) + + np.testing.assert_array_almost_equal(t_hann_window, p_hann_window) + np.testing.assert_array_almost_equal(t_hamm_window, p_hamm_window) + np.testing.assert_array_almost_equal(t_povey_window, p_povey_window) + + def test_fbank(self): + ta_features = torchaudio.compliance.kaldi.fbank( + torch.from_numpy(self.waveform.astype(self.dtype))) + pa_features = paddleaudio.compliance.kaldi.fbank( + paddle.to_tensor(self.waveform.astype(self.dtype))) + np.testing.assert_array_almost_equal( + ta_features, pa_features, decimal=4) + + def test_mfcc(self): + ta_features = torchaudio.compliance.kaldi.mfcc( + torch.from_numpy(self.waveform.astype(self.dtype))) + pa_features = paddleaudio.compliance.kaldi.mfcc( + 
paddle.to_tensor(self.waveform.astype(self.dtype))) + np.testing.assert_array_almost_equal( + ta_features, pa_features, decimal=4) + + +if __name__ == '__main__': + unittest.main() diff --git a/paddleaudio/tests/features/test_librosa.py b/paddleaudio/tests/features/test_librosa.py new file mode 100644 index 000000000..cf0c98c72 --- /dev/null +++ b/paddleaudio/tests/features/test_librosa.py @@ -0,0 +1,281 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import librosa +import numpy as np +import paddle + +import paddleaudio +from .base import FeatTest +from paddleaudio.functional.window import get_window + + +class TestLibrosa(FeatTest): + def initParmas(self): + self.n_fft = 512 + self.hop_length = 128 + self.n_mels = 40 + self.n_mfcc = 20 + self.fmin = 0.0 + self.window_str = 'hann' + self.pad_mode = 'reflect' + self.top_db = 80.0 + + def test_stft(self): + if len(self.waveform.shape) == 2: # (C, T) + self.waveform = self.waveform.squeeze( + 0) # 1D input for librosa.feature.melspectrogram + + feature_librosa = librosa.core.stft( + y=self.waveform, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=None, + window=self.window_str, + center=True, + dtype=None, + pad_mode=self.pad_mode, ) + x = paddle.to_tensor(self.waveform).unsqueeze(0) + window = get_window(self.window_str, self.n_fft, dtype=x.dtype) + feature_paddle = paddle.signal.stft( + x=x, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=None, + window=window, + center=True, + pad_mode=self.pad_mode, + normalized=False, + onesided=True, ).squeeze(0) + + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddle, decimal=5) + + def test_istft(self): + if len(self.waveform.shape) == 2: # (C, T) + self.waveform = self.waveform.squeeze( + 0) # 1D input for librosa.feature.melspectrogram + + # Get stft result from librosa. 
+ stft_matrix = librosa.core.stft( + y=self.waveform, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=None, + window=self.window_str, + center=True, + pad_mode=self.pad_mode, ) + + feature_librosa = librosa.core.istft( + stft_matrix=stft_matrix, + hop_length=self.hop_length, + win_length=None, + window=self.window_str, + center=True, + dtype=None, + length=None, ) + + x = paddle.to_tensor(stft_matrix).unsqueeze(0) + window = get_window( + self.window_str, + self.n_fft, + dtype=paddle.to_tensor(self.waveform).dtype) + feature_paddle = paddle.signal.istft( + x=x, + n_fft=self.n_fft, + hop_length=self.hop_length, + win_length=None, + window=window, + center=True, + normalized=False, + onesided=True, + length=None, + return_complex=False, ).squeeze(0) + + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddle, decimal=5) + + def test_mel(self): + feature_librosa = librosa.filters.mel( + sr=self.sr, + n_fft=self.n_fft, + n_mels=self.n_mels, + fmin=self.fmin, + fmax=None, + htk=False, + norm='slaney', + dtype=self.waveform.dtype, ) + feature_compliance = paddleaudio.compliance.librosa.compute_fbank_matrix( + sr=self.sr, + n_fft=self.n_fft, + n_mels=self.n_mels, + fmin=self.fmin, + fmax=None, + htk=False, + norm='slaney', + dtype=self.waveform.dtype, ) + x = paddle.to_tensor(self.waveform) + feature_functional = paddleaudio.functional.compute_fbank_matrix( + sr=self.sr, + n_fft=self.n_fft, + n_mels=self.n_mels, + f_min=self.fmin, + f_max=None, + htk=False, + norm='slaney', + dtype=x.dtype, ) + + np.testing.assert_array_almost_equal(feature_librosa, + feature_compliance) + np.testing.assert_array_almost_equal(feature_librosa, + feature_functional) + + def test_melspect(self): + if len(self.waveform.shape) == 2: # (C, T) + self.waveform = self.waveform.squeeze( + 0) # 1D input for librosa.feature.melspectrogram + + # librosa: + feature_librosa = librosa.feature.melspectrogram( + y=self.waveform, + sr=self.sr, + n_fft=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + fmin=self.fmin) + + # paddleaudio.compliance.librosa: + feature_compliance = paddleaudio.compliance.librosa.melspectrogram( + x=self.waveform, + sr=self.sr, + window_size=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + fmin=self.fmin, + to_db=False) + + # paddleaudio.features.layer + x = paddle.to_tensor( + self.waveform, dtype=paddle.float64).unsqueeze(0) # Add batch dim. 
+ feature_extractor = paddleaudio.features.MelSpectrogram( + sr=self.sr, + n_fft=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + f_min=self.fmin, + dtype=x.dtype) + feature_layer = feature_extractor(x).squeeze(0).numpy() + + np.testing.assert_array_almost_equal( + feature_librosa, feature_compliance, decimal=5) + np.testing.assert_array_almost_equal( + feature_librosa, feature_layer, decimal=5) + + def test_log_melspect(self): + if len(self.waveform.shape) == 2: # (C, T) + self.waveform = self.waveform.squeeze( + 0) # 1D input for librosa.feature.melspectrogram + + # librosa: + feature_librosa = librosa.feature.melspectrogram( + y=self.waveform, + sr=self.sr, + n_fft=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + fmin=self.fmin) + feature_librosa = librosa.power_to_db(feature_librosa, top_db=None) + + # paddleaudio.compliance.librosa: + feature_compliance = paddleaudio.compliance.librosa.melspectrogram( + x=self.waveform, + sr=self.sr, + window_size=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + fmin=self.fmin) + + # paddleaudio.features.layer + x = paddle.to_tensor( + self.waveform, dtype=paddle.float64).unsqueeze(0) # Add batch dim. + feature_extractor = paddleaudio.features.LogMelSpectrogram( + sr=self.sr, + n_fft=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + f_min=self.fmin, + dtype=x.dtype) + feature_layer = feature_extractor(x).squeeze(0).numpy() + + np.testing.assert_array_almost_equal( + feature_librosa, feature_compliance, decimal=5) + np.testing.assert_array_almost_equal( + feature_librosa, feature_layer, decimal=4) + + def test_mfcc(self): + if len(self.waveform.shape) == 2: # (C, T) + self.waveform = self.waveform.squeeze( + 0) # 1D input for librosa.feature.melspectrogram + + # librosa: + feature_librosa = librosa.feature.mfcc( + y=self.waveform, + sr=self.sr, + S=None, + n_mfcc=self.n_mfcc, + dct_type=2, + norm='ortho', + lifter=0, + n_fft=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + fmin=self.fmin) + + # paddleaudio.compliance.librosa: + feature_compliance = paddleaudio.compliance.librosa.mfcc( + x=self.waveform, + sr=self.sr, + n_mfcc=self.n_mfcc, + dct_type=2, + norm='ortho', + lifter=0, + window_size=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + fmin=self.fmin, + top_db=self.top_db) + + # paddleaudio.features.layer + x = paddle.to_tensor( + self.waveform, dtype=paddle.float64).unsqueeze(0) # Add batch dim. + feature_extractor = paddleaudio.features.MFCC( + sr=self.sr, + n_mfcc=self.n_mfcc, + n_fft=self.n_fft, + hop_length=self.hop_length, + n_mels=self.n_mels, + f_min=self.fmin, + top_db=self.top_db, + dtype=x.dtype) + feature_layer = feature_extractor(x).squeeze(0).numpy() + + np.testing.assert_array_almost_equal( + feature_librosa, feature_compliance, decimal=4) + np.testing.assert_array_almost_equal( + feature_librosa, feature_layer, decimal=4) + + +if __name__ == '__main__': + unittest.main() diff --git a/paddleaudio/tests/features/test_log_melspectrogram.py b/paddleaudio/tests/features/test_log_melspectrogram.py new file mode 100644 index 000000000..6bae2df3f --- /dev/null +++ b/paddleaudio/tests/features/test_log_melspectrogram.py @@ -0,0 +1,50 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import numpy as np
+import paddle
+
+import paddleaudio
+from .base import FeatTest
+from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogram
+
+
+class TestLogMelSpectrogram(FeatTest):
+    def initParmas(self):
+        self.n_fft = 512
+        self.hop_length = 128
+        self.n_mels = 40
+
+    def test_log_melspect(self):
+        ps_melspect = LogMelSpectrogram(self.sr, self.n_mels, self.n_fft,
+                                        self.hop_length)
+        ps_res = ps_melspect(self.waveform.T).squeeze(1).T
+
+        x = paddle.to_tensor(self.waveform)
+        # The paddlespeech.s2t features mix up magnitude and power spectrograms, hence the scaling below.
+        ps_melspect = paddleaudio.features.LogMelSpectrogram(
+            self.sr,
+            self.n_fft,
+            self.hop_length,
+            power=1.0,
+            n_mels=self.n_mels,
+            f_min=0.0)
+        pa_res = (ps_melspect(x) / 10.0).squeeze(0).numpy()
+
+        np.testing.assert_array_almost_equal(ps_res, pa_res, decimal=5)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/paddleaudio/tests/features/test_spectrogram.py b/paddleaudio/tests/features/test_spectrogram.py
new file mode 100644
index 000000000..50b21403b
--- /dev/null
+++ b/paddleaudio/tests/features/test_spectrogram.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import numpy as np
+import paddle
+
+import paddleaudio
+from .base import FeatTest
+from paddlespeech.s2t.transform.spectrogram import Spectrogram
+
+
+class TestSpectrogram(FeatTest):
+    def initParmas(self):
+        self.n_fft = 512
+        self.hop_length = 128
+
+    def test_spectrogram(self):
+        ps_spect = Spectrogram(self.n_fft, self.hop_length)
+        ps_res = ps_spect(self.waveform.T).squeeze(1).T  # Magnitude
+
+        x = paddle.to_tensor(self.waveform)
+        pa_spect = paddleaudio.features.Spectrogram(
+            self.n_fft, self.hop_length, power=1.0)
+        pa_res = pa_spect(x).squeeze(0).numpy()
+
+        np.testing.assert_array_almost_equal(ps_res, pa_res, decimal=5)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/paddleaudio/tests/features/test_stft.py b/paddleaudio/tests/features/test_stft.py
new file mode 100644
index 000000000..c64b5ebe6
--- /dev/null
+++ b/paddleaudio/tests/features/test_stft.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import numpy as np +import paddle + +from .base import FeatTest +from paddleaudio.functional.window import get_window +from paddlespeech.s2t.transform.spectrogram import Stft + + +class TestStft(FeatTest): + def initParmas(self): + self.n_fft = 512 + self.hop_length = 128 + self.window_str = 'hann' + + def test_stft(self): + ps_stft = Stft(self.n_fft, self.hop_length) + ps_res = ps_stft( + self.waveform.T).squeeze(1).T # (n_fft//2 + 1, n_frmaes) + + x = paddle.to_tensor(self.waveform) + window = get_window(self.window_str, self.n_fft, dtype=x.dtype) + pd_res = paddle.signal.stft( + x, self.n_fft, self.hop_length, window=window).squeeze(0).numpy() + + np.testing.assert_array_almost_equal(ps_res, pd_res, decimal=5) + + +if __name__ == '__main__': + unittest.main() diff --git a/paddlespeech/cli/README.md b/paddlespeech/cli/README.md index 5ac7a3bca..19c822040 100644 --- a/paddlespeech/cli/README.md +++ b/paddlespeech/cli/README.md @@ -13,6 +13,12 @@ paddlespeech cls --input input.wav ``` + ## Speaker Verification + + ```bash + paddlespeech vector --task spk --input input_16k.wav + ``` + ## Automatic Speech Recognition ``` paddlespeech asr --lang zh --input input_16k.wav diff --git a/paddlespeech/cli/README_cn.md b/paddlespeech/cli/README_cn.md index 75ab9e41b..4b15d6c7b 100644 --- a/paddlespeech/cli/README_cn.md +++ b/paddlespeech/cli/README_cn.md @@ -12,6 +12,12 @@ ## 声音分类 ```bash paddlespeech cls --input input.wav + ``` + + ## 声纹识别 + + ```bash + paddlespeech vector --task spk --input input_16k.wav ``` ## 语音识别 diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py index b526a3849..ddf0359bc 100644 --- a/paddlespeech/cli/__init__.py +++ b/paddlespeech/cli/__init__.py @@ -21,5 +21,6 @@ from .st import STExecutor from .stats import StatsExecutor from .text import TextExecutor from .tts import TTSExecutor +from .vector import VectorExecutor _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8']) diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index ab5eee6e2..f56d8a579 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -193,7 +193,8 @@ class CLSExecutor(BaseExecutor): sr=feat_conf['sample_rate'], mono=True, dtype='float32') - logger.info("Preprocessing audio_file:" + audio_file) + if isinstance(audio_file, (str, os.PathLike)): + logger.info("Preprocessing audio_file:" + audio_file) # Feature extraction feature_extractor = LogMelSpectrogram( diff --git a/paddlespeech/cli/executor.py b/paddlespeech/cli/executor.py index d77d27b03..064939a85 100644 --- a/paddlespeech/cli/executor.py +++ b/paddlespeech/cli/executor.py @@ -178,7 +178,8 @@ class BaseExecutor(ABC): Returns: bool: return `True` for job input, `False` otherwise. 
""" - return input_ and os.path.isfile(input_) and input_.endswith('.job') + return input_ and os.path.isfile(input_) and (input_.endswith('.job') or + input_.endswith('.txt')) def _get_job_contents( self, job_input: os.PathLike) -> Dict[str, Union[str, os.PathLike]]: diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index 8423dfa8d..c7a1edc93 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -237,6 +237,42 @@ pretrained_models = { 'speech_stats': 'feats_stats.npy', }, + "hifigan_ljspeech-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip', + 'md5': + '70e9131695decbca06a65fe51ed38a72', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + "hifigan_aishell3-zh": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip', + 'md5': + '3bb49bc75032ed12f79c00c8cc79a09a', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, + "hifigan_vctk-en": { + 'url': + 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip', + 'md5': + '7da8f88359bca2457e705d924cf27bd4', + 'config': + 'default.yaml', + 'ckpt': + 'snapshot_iter_2500000.pdz', + 'speech_stats': + 'feats_stats.npy', + }, # wavernn "wavernn_csmsc-zh": { @@ -365,6 +401,9 @@ class TTSExecutor(BaseExecutor): 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc', + 'hifigan_ljspeech', + 'hifigan_aishell3', + 'hifigan_vctk', 'wavernn_csmsc', ], help='Choose vocoder type of tts task.') diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py index d7dcc90c7..f7d64b9a9 100644 --- a/paddlespeech/cli/utils.py +++ b/paddlespeech/cli/utils.py @@ -192,7 +192,7 @@ class ConfigCache: try: cfg = yaml.load(file, Loader=yaml.FullLoader) self._data.update(cfg) - except: + except Exception as e: self.flush() @property diff --git a/paddlespeech/cli/vector/__init__.py b/paddlespeech/cli/vector/__init__.py new file mode 100644 index 000000000..038596af0 --- /dev/null +++ b/paddlespeech/cli/vector/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .infer import VectorExecutor diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py new file mode 100644 index 000000000..175a9723e --- /dev/null +++ b/paddlespeech/cli/vector/infer.py @@ -0,0 +1,448 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os +import sys +from collections import OrderedDict +from typing import List +from typing import Optional +from typing import Union + +import paddle +import soundfile +from yacs.config import CfgNode + +from ..executor import BaseExecutor +from ..log import logger +from ..utils import cli_register +from ..utils import download_and_decompress +from ..utils import MODEL_HOME +from ..utils import stats_wrapper +from paddleaudio.backends import load as load_audio +from paddleaudio.compliance.librosa import melspectrogram +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.vector.io.batch import feature_normalize +from paddlespeech.vector.modules.sid_model import SpeakerIdetification + +pretrained_models = { + # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]". + # e.g. "ecapatdnn_voxceleb12-16k". + # Command line and python api use "{model_name}[-{dataset}]" as --model, usage: + # "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-16k --sr 16000 --input ./input.wav" + "ecapatdnn_voxceleb12-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_1.tar.gz', + 'md5': + 'a1c0dba7d4de997187786ff517d5b4ec', + 'cfg_path': + 'conf/model.yaml', # the yaml config path + 'ckpt_path': + 'model/model', # the format is ${dir}/{model_name}, + # so the first 'model' is dir, the second 'model' is the name + # this means we have a model stored as model/model.pdparams + }, +} + +model_alias = { + "ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn", +} + + +@cli_register( + name="paddlespeech.vector", + description="Speech to vector embedding infer command.") +class VectorExecutor(BaseExecutor): + def __init__(self): + super(VectorExecutor, self).__init__() + + self.parser = argparse.ArgumentParser( + prog="paddlespeech.vector", add_help=True) + + self.parser.add_argument( + "--model", + type=str, + default="ecapatdnn_voxceleb12", + choices=["ecapatdnn_voxceleb12"], + help="Choose model type of vector task.") + self.parser.add_argument( + "--task", + type=str, + default="spk", + choices=["spk"], + help="task type in vector domain") + self.parser.add_argument( + "--input", + type=str, + default=None, + help="Audio file to extract embedding.") + self.parser.add_argument( + "--sample_rate", + type=int, + default=16000, + choices=[16000], + help="Choose the audio sample rate of the model. 8000 or 16000") + self.parser.add_argument( + "--ckpt_path", + type=str, + default=None, + help="Checkpoint file of model.") + self.parser.add_argument( + '--config', + type=str, + default=None, + help='Config of asr task. 
Use default config when it is None.')
+        self.parser.add_argument(
+            "--device",
+            type=str,
+            default=paddle.get_device(),
+            help="Choose device to execute model inference.")
+        self.parser.add_argument(
+            '-d',
+            '--job_dump_result',
+            action='store_true',
+            help='Save job result into file.')
+
+        self.parser.add_argument(
+            '-v',
+            '--verbose',
+            action='store_true',
+            help='Increase logger verbosity of current task.')
+
+    def execute(self, argv: List[str]) -> bool:
+        """Command line entry for the vector model.
+
+        Args:
+            argv (List[str]): command line args list
+
+        Returns:
+            bool:
+                False: at least one audio file failed
+                True: all audio files were processed successfully
+        """
+        # stage 0: parse the args and get the required args
+        parser_args = self.parser.parse_args(argv)
+        model = parser_args.model
+        sample_rate = parser_args.sample_rate
+        config = parser_args.config
+        ckpt_path = parser_args.ckpt_path
+        device = parser_args.device
+
+        # stage 1: configure the verbose flag
+        if not parser_args.verbose:
+            self.disable_task_loggers()
+
+        # stage 2: read the input data and store them as a list
+        task_source = self.get_task_source(parser_args.input)
+        logger.info(f"task source: {task_source}")
+
+        # stage 3: process the audio files one by one
+        task_result = OrderedDict()
+        has_exceptions = False
+        for id_, input_ in task_source.items():
+            try:
+                res = self(input_, model, sample_rate, config, ckpt_path,
+                           device)
+                task_result[id_] = res
+            except Exception as e:
+                has_exceptions = True
+                task_result[id_] = f'{e.__class__.__name__}: {e}'
+
+        logger.info("task result as follows: ")
+        logger.info(f"{task_result}")
+
+        # stage 4: process all the task results
+        self.process_task_results(parser_args.input, task_result,
+                                  parser_args.job_dump_result)
+
+        # stage 5: return the exception flag
+        # if False is returned, at least one audio file raised an error
+        if has_exceptions:
+            return False
+        else:
+            return True
+
+    @stats_wrapper
+    def __call__(self,
+                 audio_file: os.PathLike,
+                 model: str='ecapatdnn_voxceleb12',
+                 sample_rate: int=16000,
+                 config: os.PathLike=None,
+                 ckpt_path: os.PathLike=None,
+                 device=paddle.get_device()):
+        """Extract the audio embedding.
+
+        Args:
+            audio_file (os.PathLike): audio path,
+                whose format must be wav and whose sample rate must match the model
+            model (str, optional): model type, loaded from the pretrained model list.
+                Defaults to 'ecapatdnn_voxceleb12'.
+            sample_rate (int, optional): model sample rate. Defaults to 16000.
+            config (os.PathLike, optional): yaml config. Defaults to None.
+            ckpt_path (os.PathLike, optional): pretrained model path. Defaults to None.
+            device (optional): paddle running host device. Defaults to paddle.get_device().
+ + Returns: + dict: return the audio embedding and the embedding shape + """ + # stage 0: check the audio format + audio_file = os.path.abspath(audio_file) + if not self._check(audio_file, sample_rate): + sys.exit(-1) + + # stage 1: set the paddle runtime host device + logger.info(f"device type: {device}") + paddle.device.set_device(device) + + # stage 2: read the specific pretrained model + self._init_from_path(model, sample_rate, config, ckpt_path) + + # stage 3: preprocess the audio and get the audio feat + self.preprocess(model, audio_file) + + # stage 4: infer the model and get the audio embedding + self.infer(model) + + # stage 5: process the result and set them to output dict + res = self.postprocess() + + return res + + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """get the neural network path from the pretrained model list + we stored all the pretained mode in the variable `pretrained_models` + + Args: + tag (str): model tag in the pretrained model list + + Returns: + os.PathLike: the downloaded pretrained model path in the disk + """ + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, \ + 'The model "{}" you want to use has not been supported,'\ + 'please choose other models.\n' \ + 'The support models includes\n\t\t{}'.format(tag, "\n\t\t".join(support_models)) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + + decompressed_path = os.path.abspath(decompressed_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + + return decompressed_path + + def _init_from_path(self, + model_type: str='ecapatdnn_voxceleb12', + sample_rate: int=16000, + cfg_path: Optional[os.PathLike]=None, + ckpt_path: Optional[os.PathLike]=None): + """Init the neural network from the model path + + Args: + model_type (str, optional): model tag in the pretrained model list. + Defaults to 'ecapatdnn_voxceleb12'. + sample_rate (int, optional): model sample rate. + Defaults to 16000. + cfg_path (Optional[os.PathLike], optional): yaml config file path. + Defaults to None. + ckpt_path (Optional[os.PathLike], optional): the pretrained model path, which is stored in the disk. + Defaults to None. 
+ """ + # stage 0: avoid to init the mode again + if hasattr(self, "model"): + logger.info("Model has been initialized") + return + + # stage 1: get the model and config path + # if we want init the network from the model stored in the disk, + # we must pass the config path and the ckpt model path + if cfg_path is None or ckpt_path is None: + # get the mode from pretrained list + sample_rate_str = "16k" if sample_rate == 16000 else "8k" + tag = model_type + "-" + sample_rate_str + logger.info(f"load the pretrained model: {tag}") + # get the model from the pretrained list + # we download the pretrained model and store it in the res_path + res_path = self._get_pretrained_path(tag) + self.res_path = res_path + + self.cfg_path = os.path.join(res_path, + pretrained_models[tag]['cfg_path']) + self.ckpt_path = os.path.join( + res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams') + else: + # get the model from disk + self.cfg_path = os.path.abspath(cfg_path) + self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams") + self.res_path = os.path.dirname( + os.path.dirname(os.path.abspath(self.cfg_path))) + + logger.info(f"start to read the ckpt from {self.ckpt_path}") + logger.info(f"read the config from {self.cfg_path}") + logger.info(f"get the res path {self.res_path}") + + # stage 2: read and config and init the model body + self.config = CfgNode(new_allowed=True) + self.config.merge_from_file(self.cfg_path) + + # stage 3: get the model name to instance the model network with dynamic_import + logger.info("start to dynamic import the model class") + model_name = model_type[:model_type.rindex('_')] + logger.info(f"model name {model_name}") + model_class = dynamic_import(model_name, model_alias) + model_conf = self.config.model + backbone = model_class(**model_conf) + model = SpeakerIdetification( + backbone=backbone, num_class=self.config.num_speakers) + self.model = model + self.model.eval() + + # stage 4: load the model parameters + logger.info("start to set the model parameters to model") + model_dict = paddle.load(self.ckpt_path) + self.model.set_state_dict(model_dict) + + logger.info("create the model instance success") + + @paddle.no_grad() + def infer(self, model_type: str): + """Infer the model to get the embedding + + Args: + model_type (str): speaker verification model type + """ + # stage 0: get the feat and length from _inputs + feats = self._inputs["feats"] + lengths = self._inputs["lengths"] + logger.info("start to do backbone network model forward") + logger.info( + f"feats shape:{feats.shape}, lengths shape: {lengths.shape}") + + # stage 1: get the audio embedding + # embedding from (1, emb_size, 1) -> (emb_size) + embedding = self.model.backbone(feats, lengths).squeeze().numpy() + logger.info(f"embedding size: {embedding.shape}") + + # stage 2: put the embedding and dim info to _outputs property + # the embedding type is numpy.array + self._outputs["embedding"] = embedding + + def postprocess(self) -> Union[str, os.PathLike]: + """Return the audio embedding info + + Returns: + Union[str, os.PathLike]: audio embedding info + """ + embedding = self._outputs["embedding"] + return embedding + + def preprocess(self, model_type: str, input_file: Union[str, os.PathLike]): + """Extract the audio feat + + Args: + model_type (str): speaker verification model type + input_file (Union[str, os.PathLike]): audio file path + """ + audio_file = input_file + if isinstance(audio_file, (str, os.PathLike)): + logger.info(f"Preprocess audio file: {audio_file}") + + # stage 1: load the audio 
sample points + # Note: this process must match the training process + waveform, sr = load_audio(audio_file) + logger.info(f"load the audio sample points, shape is: {waveform.shape}") + + # stage 2: get the audio feat + # Note: Now we only support fbank feature + try: + feat = melspectrogram( + x=waveform, + sr=self.config.sr, + n_mels=self.config.n_mels, + window_size=self.config.window_size, + hop_length=self.config.hop_size) + logger.info(f"extract the audio feat, shape is: {feat.shape}") + except Exception as e: + logger.info(f"feat occurs exception {e}") + sys.exit(-1) + + feat = paddle.to_tensor(feat).unsqueeze(0) + # in inference period, the lengths is all one without padding + lengths = paddle.ones([1]) + + # stage 3: we do feature normalize, + # Now we assume that the feat must do normalize + feat = feature_normalize(feat, mean_norm=True, std_norm=False) + + # stage 4: store the feat and length in the _inputs, + # which will be used in other function + logger.info(f"feats shape: {feat.shape}") + self._inputs["feats"] = feat + self._inputs["lengths"] = lengths + + logger.info("audio extract the feat success") + + def _check(self, audio_file: str, sample_rate: int): + """Check if the model sample match the audio sample rate + + Args: + audio_file (str): audio file path, which will be extracted the embedding + sample_rate (int): the desired model sample rate + + Returns: + bool: return if the audio sample rate matches the model sample rate + """ + self.sample_rate = sample_rate + if self.sample_rate != 16000 and self.sample_rate != 8000: + logger.error( + "invalid sample rate, please input --sr 8000 or --sr 16000") + return False + + if isinstance(audio_file, (str, os.PathLike)): + if not os.path.isfile(audio_file): + logger.error("Please input the right audio file path") + return False + + logger.info("checking the aduio file format......") + try: + audio, audio_sample_rate = soundfile.read( + audio_file, dtype="float32", always_2d=True) + except Exception as e: + logger.exception(e) + logger.error( + "can not open the audio file, please check the audio file format is 'wav'. \n \ + you can try to use sox to change the file format.\n \ + For example: \n \ + sample rate: 16k \n \ + sox input_audio.xx --rate 16k --bits 16 --channels 1 output_audio.wav \n \ + sample rate: 8k \n \ + sox input_audio.xx --rate 8k --bits 16 --channels 1 output_audio.wav \n \ + ") + return False + + logger.info(f"The sample rate is {audio_sample_rate}") + + if audio_sample_rate != self.sample_rate: + logger.error("The sample rate of the input file is not {}.\n \ + The program will resample the wav file to {}.\n \ + If the result does not meet your expectations,\n \ + Please input the 16k 16 bit 1 channel wav file. \ + ".format(self.sample_rate, self.sample_rate)) + sys.exit(-1) + else: + logger.info("The audio file format is right") + + return True diff --git a/paddlespeech/s2t/decoders/recog.py b/paddlespeech/s2t/decoders/recog.py index 88955eacb..2d2aa2109 100644 --- a/paddlespeech/s2t/decoders/recog.py +++ b/paddlespeech/s2t/decoders/recog.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
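For reference, the fbank pipeline implemented in `preprocess()` above could be exercised on its own roughly as sketched below. The sample rate, mel count, window and hop values here are illustrative stand-ins for whatever the model's `conf/model.yaml` actually specifies, and `./input_16k.wav` is a hypothetical input file.

```python
# Standalone sketch of the feature extraction used in preprocess() above.
# sr / n_mels / window_size / hop_length values are illustrative only;
# the real values come from the pretrained model's conf/model.yaml.
import paddle
from paddleaudio.backends import load as load_audio
from paddleaudio.compliance.librosa import melspectrogram
from paddlespeech.vector.io.batch import feature_normalize

waveform, sr = load_audio("./input_16k.wav")
feat = melspectrogram(
    x=waveform, sr=16000, n_mels=80, window_size=400, hop_length=160)
feat = paddle.to_tensor(feat).unsqueeze(0)   # add a batch axis
feat = feature_normalize(feat, mean_norm=True, std_norm=False)
lengths = paddle.ones([1])                   # no padding at inference time
```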
# Reference espnet Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# Modified from espnet(https://github.com/espnet/espnet) """V2 backend for `asr_recog.py` using py:class:`decoders.beam_search.BeamSearch`.""" import jsonlines import paddle diff --git a/paddlespeech/s2t/decoders/recog_bin.py b/paddlespeech/s2t/decoders/recog_bin.py index cd7a360ae..37b49f3a0 100644 --- a/paddlespeech/s2t/decoders/recog_bin.py +++ b/paddlespeech/s2t/decoders/recog_bin.py @@ -12,15 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. # Reference espnet Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) +# Modified from espnet(https://github.com/espnet/espnet) """End-to-end speech recognition model decoding script.""" import logging import os import random import sys -from distutils.util import strtobool import configargparse import numpy as np +from distutils.util import strtobool def get_parser(): diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index d7bee6d7f..efcc9629f 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -239,7 +239,7 @@ class U2Trainer(Trainer): n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1, - dist_sampler=False, + dist_sampler=config.get('dist_sampler', False), shortest_first=False) self.valid_loader = BatchDataLoader( @@ -260,7 +260,7 @@ class U2Trainer(Trainer): n_iter_processes=config.num_workers, subsampling_factor=1, num_encs=1, - dist_sampler=False, + dist_sampler=config.get('dist_sampler', False), shortest_first=False) logger.info("Setup train/valid Dataloader!") else: diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py index d0368cc8d..7f71e5dd9 100644 --- a/paddlespeech/s2t/frontend/audio.py +++ b/paddlespeech/s2t/frontend/audio.py @@ -208,6 +208,18 @@ class AudioSegment(): io.BytesIO(bytes), dtype='float32') return cls(samples, sample_rate) + @classmethod + def from_pcm(cls, samples, sample_rate): + """Create audio segment from a byte string containing audio samples. + :param samples: Audio samples [num_samples x num_channels]. + :type samples: numpy.ndarray + :param sample_rate: Audio sample rate. + :type sample_rate: int + :return: Audio segment instance. + :rtype: AudioSegment + """ + return cls(samples, sample_rate) + @classmethod def concatenate(cls, *segments): """Concatenate an arbitrary number of audio segments together. diff --git a/paddlespeech/s2t/frontend/speech.py b/paddlespeech/s2t/frontend/speech.py index 8fd661c92..969971047 100644 --- a/paddlespeech/s2t/frontend/speech.py +++ b/paddlespeech/s2t/frontend/speech.py @@ -107,6 +107,27 @@ class SpeechSegment(AudioSegment): return cls(audio.samples, audio.sample_rate, transcript, tokens, token_ids) + @classmethod + def from_pcm(cls, + samples, + sample_rate, + transcript, + tokens=None, + token_ids=None): + """Create speech segment from pcm on online mode + Args: + samples (numpy.ndarray): Audio samples [num_samples x num_channels]. + sample_rate (int): Audio sample rate. + transcript (str): Transcript text for the speech. + tokens (List[str], optional): text tokens. Defaults to None. + token_ids (List[int], optional): text token ids. Defaults to None. + Returns: + SpeechSegment: Speech segment instance. 
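The new `from_pcm` constructors are aimed at streaming callers that already hold raw samples in memory rather than a file on disk. A rough usage sketch (the random samples and empty transcript are placeholders, not real data):

```python
# Sketch of feeding in-memory PCM into the new from_pcm constructors.
# The random samples and empty transcript are illustrative placeholders.
import numpy as np
from paddlespeech.s2t.frontend.speech import SpeechSegment

sample_rate = 16000
samples = np.random.uniform(-0.1, 0.1, size=(sample_rate,)).astype("float32")

segment = SpeechSegment.from_pcm(samples, sample_rate, transcript="")
print(segment.sample_rate, segment.samples.shape)
```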
+ """ + audio = AudioSegment.from_pcm(samples, sample_rate) + return cls(audio.samples, audio.sample_rate, transcript, tokens, + token_ids) + @classmethod def concatenate(cls, *segments): """Concatenate an arbitrary number of speech segments together, both diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 910798127..6a98607b6 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# Modified from wenet(https://github.com/wenet-e2e/wenet) """U2 ASR Model Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition (https://arxiv.org/pdf/2012.05481.pdf) @@ -36,6 +37,7 @@ from paddlespeech.s2t.modules.ctc import CTCDecoderBase from paddlespeech.s2t.modules.decoder import TransformerDecoder from paddlespeech.s2t.modules.encoder import ConformerEncoder from paddlespeech.s2t.modules.encoder import TransformerEncoder +from paddlespeech.s2t.modules.initializer import DefaultInitializerContext from paddlespeech.s2t.modules.loss import LabelSmoothingLoss from paddlespeech.s2t.modules.mask import make_pad_mask from paddlespeech.s2t.modules.mask import mask_finished_preds @@ -72,6 +74,7 @@ class U2BaseModel(ASRInterface, nn.Layer): assert 0.0 <= ctc_weight <= 1.0, ctc_weight nn.Layer.__init__(self) + # note that eos is the same as sos (equivalent ID) self.sos = vocab_size - 1 self.eos = vocab_size - 1 @@ -780,9 +783,12 @@ class U2DecodeModel(U2BaseModel): class U2Model(U2DecodeModel): def __init__(self, configs: dict): - vocab_size, encoder, decoder, ctc = U2Model._init_from_config(configs) - model_conf = configs.get('model_conf', dict()) + init_type = model_conf.get("init_type", None) + with DefaultInitializerContext(init_type): + vocab_size, encoder, decoder, ctc = U2Model._init_from_config( + configs) + super().__init__( vocab_size=vocab_size, encoder=encoder, diff --git a/paddlespeech/s2t/models/u2/updater.py b/paddlespeech/s2t/models/u2/updater.py index bb18fe416..c59090a84 100644 --- a/paddlespeech/s2t/models/u2/updater.py +++ b/paddlespeech/s2t/models/u2/updater.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# Modified from wenet(https://github.com/wenet-e2e/wenet) from contextlib import nullcontext import paddle diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index 999723e51..6447753c5 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+# Modified from wenet(https://github.com/wenet-e2e/wenet) """U2 ASR Model Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition (https://arxiv.org/pdf/2012.05481.pdf) diff --git a/paddlespeech/s2t/modules/activation.py b/paddlespeech/s2t/modules/activation.py index 4081f7f81..2f387b0d9 100644 --- a/paddlespeech/s2t/modules/activation.py +++ b/paddlespeech/s2t/modules/activation.py @@ -17,6 +17,8 @@ import paddle from paddle import nn from paddle.nn import functional as F +from paddlespeech.s2t.modules.align import Conv2D +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() @@ -51,7 +53,7 @@ class LinearGLUBlock(nn.Layer): idim (int): input and output dimension """ super().__init__() - self.fc = nn.Linear(idim, idim * 2) + self.fc = Linear(idim, idim * 2) def forward(self, xs): return glu(self.fc(xs), dim=-1) @@ -75,7 +77,7 @@ class ConvGLUBlock(nn.Layer): self.conv_residual = None if in_ch != out_ch: self.conv_residual = nn.utils.weight_norm( - nn.Conv2D( + Conv2D( in_channels=in_ch, out_channels=out_ch, kernel_size=(1, 1)), name='weight', dim=0) @@ -86,7 +88,7 @@ class ConvGLUBlock(nn.Layer): layers = OrderedDict() if bottlececk_dim == 0: layers['conv'] = nn.utils.weight_norm( - nn.Conv2D( + Conv2D( in_channels=in_ch, out_channels=out_ch * 2, kernel_size=(kernel_size, 1)), @@ -106,7 +108,7 @@ class ConvGLUBlock(nn.Layer): dim=0) layers['dropout_in'] = nn.Dropout(p=dropout) layers['conv_bottleneck'] = nn.utils.weight_norm( - nn.Conv2D( + Conv2D( in_channels=bottlececk_dim, out_channels=bottlececk_dim, kernel_size=(kernel_size, 1)), @@ -115,7 +117,7 @@ class ConvGLUBlock(nn.Layer): layers['dropout'] = nn.Dropout(p=dropout) layers['glu'] = GLU() layers['conv_out'] = nn.utils.weight_norm( - nn.Conv2D( + Conv2D( in_channels=bottlececk_dim, out_channels=out_ch * 2, kernel_size=(1, 1)), diff --git a/paddlespeech/s2t/modules/align.py b/paddlespeech/s2t/modules/align.py new file mode 100644 index 000000000..f88916793 --- /dev/null +++ b/paddlespeech/s2t/modules/align.py @@ -0,0 +1,139 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +from paddle import nn + +from paddlespeech.s2t.modules.initializer import KaimingUniform +""" + To align the initializer between paddle and torch, + the API below are set defalut initializer with priority higger than global initializer. 
+""" +global_init_type = None + + +class LayerNorm(nn.LayerNorm): + def __init__(self, + normalized_shape, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + name=None): + if weight_attr is None: + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(1.0)) + if bias_attr is None: + bias_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(0.0)) + super(LayerNorm, self).__init__(normalized_shape, epsilon, weight_attr, + bias_attr, name) + + +class BatchNorm1D(nn.BatchNorm1D): + def __init__(self, + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCL', + name=None): + if weight_attr is None: + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(1.0)) + if bias_attr is None: + bias_attr = paddle.ParamAttr( + initializer=nn.initializer.Constant(0.0)) + super(BatchNorm1D, + self).__init__(num_features, momentum, epsilon, weight_attr, + bias_attr, data_format, name) + + +class Embedding(nn.Embedding): + def __init__(self, + num_embeddings, + embedding_dim, + padding_idx=None, + sparse=False, + weight_attr=None, + name=None): + if weight_attr is None: + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal()) + super(Embedding, self).__init__(num_embeddings, embedding_dim, + padding_idx, sparse, weight_attr, name) + + +class Linear(nn.Linear): + def __init__(self, + in_features, + out_features, + weight_attr=None, + bias_attr=None, + name=None): + if weight_attr is None: + if global_init_type == "kaiming_uniform": + weight_attr = paddle.ParamAttr(initializer=KaimingUniform()) + if bias_attr is None: + if global_init_type == "kaiming_uniform": + bias_attr = paddle.ParamAttr(initializer=KaimingUniform()) + super(Linear, self).__init__(in_features, out_features, weight_attr, + bias_attr, name) + + +class Conv1D(nn.Conv1D): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format='NCL'): + if weight_attr is None: + if global_init_type == "kaiming_uniform": + print("set kaiming_uniform") + weight_attr = paddle.ParamAttr(initializer=KaimingUniform()) + if bias_attr is None: + if global_init_type == "kaiming_uniform": + bias_attr = paddle.ParamAttr(initializer=KaimingUniform()) + super(Conv1D, self).__init__( + in_channels, out_channels, kernel_size, stride, padding, dilation, + groups, padding_mode, weight_attr, bias_attr, data_format) + + +class Conv2D(nn.Conv2D): + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format='NCHW'): + if weight_attr is None: + if global_init_type == "kaiming_uniform": + weight_attr = paddle.ParamAttr(initializer=KaimingUniform()) + if bias_attr is None: + if global_init_type == "kaiming_uniform": + bias_attr = paddle.ParamAttr(initializer=KaimingUniform()) + super(Conv2D, self).__init__( + in_channels, out_channels, kernel_size, stride, padding, dilation, + groups, padding_mode, weight_attr, bias_attr, data_format) diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py index 3d5f8cd1d..438efd2a1 100644 --- a/paddlespeech/s2t/modules/attention.py +++ b/paddlespeech/s2t/modules/attention.py @@ -22,6 +22,7 @@ import paddle from paddle import nn from paddle.nn import initializer as I +from paddlespeech.s2t.modules.align import Linear from 
paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() @@ -48,10 +49,10 @@ class MultiHeadedAttention(nn.Layer): # We assume d_v always equals d_k self.d_k = n_feat // n_head self.h = n_head - self.linear_q = nn.Linear(n_feat, n_feat) - self.linear_k = nn.Linear(n_feat, n_feat) - self.linear_v = nn.Linear(n_feat, n_feat) - self.linear_out = nn.Linear(n_feat, n_feat) + self.linear_q = Linear(n_feat, n_feat) + self.linear_k = Linear(n_feat, n_feat) + self.linear_v = Linear(n_feat, n_feat) + self.linear_out = Linear(n_feat, n_feat) self.dropout = nn.Dropout(p=dropout_rate) def forward_qkv(self, @@ -95,7 +96,7 @@ class MultiHeadedAttention(nn.Layer): mask (paddle.Tensor): Mask, size (#batch, 1, time2) or (#batch, time1, time2). Returns: - paddle.Tensor: Transformed value weighted + paddle.Tensor: Transformed value weighted by the attention score, (#batch, time1, d_model). """ n_batch = value.shape[0] @@ -150,7 +151,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention): """ super().__init__(n_head, n_feat, dropout_rate) # linear transformation for positional encoding - self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False) + self.linear_pos = Linear(n_feat, n_feat, bias_attr=False) # these two learnable bias are used in matrix c and matrix d # as described in https://arxiv.org/abs/1901.02860 Section 3.3 #self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k)) diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py index 7ec92554e..89e652688 100644 --- a/paddlespeech/s2t/modules/conformer_convolution.py +++ b/paddlespeech/s2t/modules/conformer_convolution.py @@ -21,6 +21,9 @@ import paddle from paddle import nn from typeguard import check_argument_types +from paddlespeech.s2t.modules.align import BatchNorm1D +from paddlespeech.s2t.modules.align import Conv1D +from paddlespeech.s2t.modules.align import LayerNorm from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() @@ -49,7 +52,7 @@ class ConvolutionModule(nn.Layer): """ assert check_argument_types() super().__init__() - self.pointwise_conv1 = nn.Conv1D( + self.pointwise_conv1 = Conv1D( channels, 2 * channels, kernel_size=1, @@ -60,8 +63,8 @@ class ConvolutionModule(nn.Layer): ) # self.lorder is used to distinguish if it's a causal convolution, - # if self.lorder > 0: - # it's a causal convolution, the input will be padded with + # if self.lorder > 0: + # it's a causal convolution, the input will be padded with # `self.lorder` frames on the left in forward (causal conv impl). 
# else: it's a symmetrical convolution if causal: @@ -73,7 +76,7 @@ class ConvolutionModule(nn.Layer): padding = (kernel_size - 1) // 2 self.lorder = 0 - self.depthwise_conv = nn.Conv1D( + self.depthwise_conv = Conv1D( channels, channels, kernel_size, @@ -87,12 +90,12 @@ class ConvolutionModule(nn.Layer): assert norm in ['batch_norm', 'layer_norm'] if norm == "batch_norm": self.use_layer_norm = False - self.norm = nn.BatchNorm1D(channels) + self.norm = BatchNorm1D(channels) else: self.use_layer_norm = True - self.norm = nn.LayerNorm(channels) + self.norm = LayerNorm(channels) - self.pointwise_conv2 = nn.Conv1D( + self.pointwise_conv2 = Conv1D( channels, channels, kernel_size=1, diff --git a/paddlespeech/s2t/modules/ctc.py b/paddlespeech/s2t/modules/ctc.py index 2094182af..33ad472de 100644 --- a/paddlespeech/s2t/modules/ctc.py +++ b/paddlespeech/s2t/modules/ctc.py @@ -18,6 +18,7 @@ from paddle import nn from paddle.nn import functional as F from typeguard import check_argument_types +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.modules.loss import CTCLoss from paddlespeech.s2t.utils import ctc_utils from paddlespeech.s2t.utils.log import Log @@ -69,7 +70,7 @@ class CTCDecoderBase(nn.Layer): self.blank_id = blank_id self.odim = odim self.dropout = nn.Dropout(dropout_rate) - self.ctc_lo = nn.Linear(enc_n_units, self.odim) + self.ctc_lo = Linear(enc_n_units, self.odim) reduction_type = "sum" if reduction else "none" self.criterion = CTCLoss( blank=self.blank_id, diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index 6b4d95912..3a851ec62 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -24,6 +24,9 @@ from paddle import nn from typeguard import check_argument_types from paddlespeech.s2t.decoders.scorers.scorer_interface import BatchScorerInterface +from paddlespeech.s2t.modules.align import Embedding +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.modules.attention import MultiHeadedAttention from paddlespeech.s2t.modules.decoder_layer import DecoderLayer from paddlespeech.s2t.modules.embedding import PositionalEncoding @@ -76,21 +79,22 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): concat_after: bool=False, ): assert check_argument_types() + nn.Layer.__init__(self) self.selfattention_layer_type = 'selfattn' attention_dim = encoder_output_size if input_layer == "embed": self.embed = nn.Sequential( - nn.Embedding(vocab_size, attention_dim), + Embedding(vocab_size, attention_dim), PositionalEncoding(attention_dim, positional_dropout_rate), ) else: raise ValueError(f"only 'embed' is supported: {input_layer}") self.normalize_before = normalize_before - self.after_norm = nn.LayerNorm(attention_dim, epsilon=1e-12) + self.after_norm = LayerNorm(attention_dim, epsilon=1e-12) self.use_output_layer = use_output_layer - self.output_layer = nn.Linear(attention_dim, vocab_size) + self.output_layer = Linear(attention_dim, vocab_size) self.decoders = nn.LayerList([ DecoderLayer( diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py index 520b18dea..b7f8694c1 100644 --- a/paddlespeech/s2t/modules/decoder_layer.py +++ b/paddlespeech/s2t/modules/decoder_layer.py @@ -20,6 +20,8 @@ from typing import Tuple import paddle from paddle import nn +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.align import Linear from 
paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() @@ -62,14 +64,14 @@ class DecoderLayer(nn.Layer): self.self_attn = self_attn self.src_attn = src_attn self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, epsilon=1e-12) - self.norm2 = nn.LayerNorm(size, epsilon=1e-12) - self.norm3 = nn.LayerNorm(size, epsilon=1e-12) + self.norm1 = LayerNorm(size, epsilon=1e-12) + self.norm2 = LayerNorm(size, epsilon=1e-12) + self.norm3 = LayerNorm(size, epsilon=1e-12) self.dropout = nn.Dropout(dropout_rate) self.normalize_before = normalize_before self.concat_after = concat_after - self.concat_linear1 = nn.Linear(size + size, size) - self.concat_linear2 = nn.Linear(size + size, size) + self.concat_linear1 = Linear(size + size, size) + self.concat_linear2 = Linear(size + size, size) def forward( self, diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index 5c8ba0810..c843c0e20 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -23,6 +23,7 @@ from paddle import nn from typeguard import check_argument_types from paddlespeech.s2t.modules.activation import get_activation +from paddlespeech.s2t.modules.align import LayerNorm from paddlespeech.s2t.modules.attention import MultiHeadedAttention from paddlespeech.s2t.modules.attention import RelPositionMultiHeadedAttention from paddlespeech.s2t.modules.conformer_convolution import ConvolutionModule @@ -129,7 +130,7 @@ class BaseEncoder(nn.Layer): d_model=output_size, dropout_rate=positional_dropout_rate), ) self.normalize_before = normalize_before - self.after_norm = nn.LayerNorm(output_size, epsilon=1e-12) + self.after_norm = LayerNorm(output_size, epsilon=1e-12) self.static_chunk_size = static_chunk_size self.use_dynamic_chunk = use_dynamic_chunk self.use_dynamic_left_chunk = use_dynamic_left_chunk @@ -457,6 +458,7 @@ class ConformerEncoder(BaseEncoder): cnn_module_norm (str): cnn conv norm type, Optional['batch_norm','layer_norm'] """ assert check_argument_types() + super().__init__(input_size, output_size, attention_heads, linear_units, num_blocks, dropout_rate, positional_dropout_rate, attention_dropout_rate, input_layer, diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py index d39c0695a..e80a298d6 100644 --- a/paddlespeech/s2t/modules/encoder_layer.py +++ b/paddlespeech/s2t/modules/encoder_layer.py @@ -20,6 +20,8 @@ from typing import Tuple import paddle from paddle import nn +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() @@ -39,7 +41,7 @@ class TransformerEncoderLayer(nn.Layer): normalize_before: bool=True, concat_after: bool=False, ): """Construct an EncoderLayer object. - + Args: size (int): Input dimension. self_attn (nn.Layer): Self-attention module instance. 
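The repeated `nn.Linear` / `nn.LayerNorm` / `nn.Conv*` swaps in these hunks all route layer construction through the `align` wrappers, so that a single `init_type` flag (set via `DefaultInitializerContext`, as in the `U2Model.__init__` change above) can switch every layer to Kaiming-uniform initialization. A minimal sketch of the intended behavior, under the assumption that the wrappers behave as written in this patch:

```python
# Sketch: align.Linear honors the init type set by DefaultInitializerContext.
# Outside the context, paddle's default initializer is used; inside it,
# align.global_init_type is set and Linear/Conv1D/Conv2D pick KaimingUniform.
from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.modules.initializer import DefaultInitializerContext

plain = Linear(4, 4)  # built with paddle's default initializer

with DefaultInitializerContext("kaiming_uniform"):
    aligned = Linear(4, 4)  # weights/bias drawn from KaimingUniform

print(plain.weight.shape, aligned.weight.shape)
```

In practice the context manager is only entered around model construction (as in `U2Model.__init__`), so the flag is reset to `None` as soon as setup finishes.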
@@ -59,15 +61,15 @@ class TransformerEncoderLayer(nn.Layer): super().__init__() self.self_attn = self_attn self.feed_forward = feed_forward - self.norm1 = nn.LayerNorm(size, epsilon=1e-12) - self.norm2 = nn.LayerNorm(size, epsilon=1e-12) + self.norm1 = LayerNorm(size, epsilon=1e-12) + self.norm2 = LayerNorm(size, epsilon=1e-12) self.dropout = nn.Dropout(dropout_rate) self.size = size self.normalize_before = normalize_before self.concat_after = concat_after # concat_linear may be not used in forward fuction, # but will be saved in the *.pt - self.concat_linear = nn.Linear(size + size, size) + self.concat_linear = Linear(size + size, size) def forward( self, @@ -147,7 +149,7 @@ class ConformerEncoderLayer(nn.Layer): normalize_before: bool=True, concat_after: bool=False, ): """Construct an EncoderLayer object. - + Args: size (int): Input dimension. self_attn (nn.Layer): Self-attention module instance. @@ -174,23 +176,23 @@ class ConformerEncoderLayer(nn.Layer): self.feed_forward = feed_forward self.feed_forward_macaron = feed_forward_macaron self.conv_module = conv_module - self.norm_ff = nn.LayerNorm(size, epsilon=1e-12) # for the FNN module - self.norm_mha = nn.LayerNorm(size, epsilon=1e-12) # for the MHA module + self.norm_ff = LayerNorm(size, epsilon=1e-12) # for the FNN module + self.norm_mha = LayerNorm(size, epsilon=1e-12) # for the MHA module if feed_forward_macaron is not None: - self.norm_ff_macaron = nn.LayerNorm(size, epsilon=1e-12) + self.norm_ff_macaron = LayerNorm(size, epsilon=1e-12) self.ff_scale = 0.5 else: self.ff_scale = 1.0 if self.conv_module is not None: - self.norm_conv = nn.LayerNorm( + self.norm_conv = LayerNorm( size, epsilon=1e-12) # for the CNN module - self.norm_final = nn.LayerNorm( + self.norm_final = LayerNorm( size, epsilon=1e-12) # for the final output of the block self.dropout = nn.Dropout(dropout_rate) self.size = size self.normalize_before = normalize_before self.concat_after = concat_after - self.concat_linear = nn.Linear(size + size, size) + self.concat_linear = Linear(size + size, size) def forward( self, diff --git a/paddlespeech/s2t/modules/initializer.py b/paddlespeech/s2t/modules/initializer.py new file mode 100644 index 000000000..30a04e44f --- /dev/null +++ b/paddlespeech/s2t/modules/initializer.py @@ -0,0 +1,172 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +from paddle.fluid import framework +from paddle.fluid import unique_name +from paddle.fluid.core import VarDesc +from paddle.fluid.initializer import MSRAInitializer + +__all__ = ['KaimingUniform'] + + +class KaimingUniform(MSRAInitializer): + r"""Implements the Kaiming Uniform initializer + + This class implements the weight initialization from the paper + `Delving Deep into Rectifiers: Surpassing Human-Level Performance on + ImageNet Classification `_ + by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. 
This is a + robust initialization method that particularly considers the rectifier + nonlinearities. + + In case of Uniform distribution, the range is [-x, x], where + + .. math:: + + x = \sqrt{\frac{1.0}{fan\_in}} + + In case of Normal distribution, the mean is 0 and the standard deviation + is + + .. math:: + + \sqrt{\\frac{2.0}{fan\_in}} + + Args: + fan_in (float32|None): fan_in for Kaiming uniform Initializer. If None, it is\ + inferred from the variable. default is None. + + Note: + It is recommended to set fan_in to None for most cases. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn as nn + + linear = nn.Linear(2, + 4, + weight_attr=nn.initializer.KaimingUniform()) + data = paddle.rand([30, 10, 2], dtype='float32') + res = linear(data) + + """ + + def __init__(self, fan_in=None): + super(KaimingUniform, self).__init__( + uniform=True, fan_in=fan_in, seed=0) + + def __call__(self, var, block=None): + """Initialize the input tensor with MSRA initialization. + + Args: + var(Tensor): Tensor that needs to be initialized. + block(Block, optional): The block in which initialization ops + should be added. Used in static graph only, default None. + + Returns: + The initialization op + """ + block = self._check_block(block) + + assert isinstance(var, framework.Variable) + assert isinstance(block, framework.Block) + f_in, f_out = self._compute_fans(var) + + # If fan_in is passed, use it + fan_in = f_in if self._fan_in is None else self._fan_in + + if self._seed == 0: + self._seed = block.program.random_seed + + # to be compatible of fp16 initalizers + if var.dtype == VarDesc.VarType.FP16 or ( + var.dtype == VarDesc.VarType.BF16 and not self._uniform): + out_dtype = VarDesc.VarType.FP32 + out_var = block.create_var( + name=unique_name.generate( + ".".join(['masra_init', var.name, 'tmp'])), + shape=var.shape, + dtype=out_dtype, + type=VarDesc.VarType.LOD_TENSOR, + persistable=False) + else: + out_dtype = var.dtype + out_var = var + + if self._uniform: + limit = np.sqrt(1.0 / float(fan_in)) + op = block.append_op( + type="uniform_random", + inputs={}, + outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": int(out_dtype), + "min": -limit, + "max": limit, + "seed": self._seed + }, + stop_gradient=True) + + else: + std = np.sqrt(2.0 / float(fan_in)) + op = block.append_op( + type="gaussian_random", + outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": int(out_dtype), + "mean": 0.0, + "std": std, + "seed": self._seed + }, + stop_gradient=True) + + if var.dtype == VarDesc.VarType.FP16 or ( + var.dtype == VarDesc.VarType.BF16 and not self._uniform): + block.append_op( + type="cast", + inputs={"X": out_var}, + outputs={"Out": var}, + attrs={"in_dtype": out_var.dtype, + "out_dtype": var.dtype}) + + if not framework.in_dygraph_mode(): + var.op = op + return op + + +class DefaultInitializerContext(object): + """ + egs: + with DefaultInitializerContext("kaiming_uniform"): + code for setup_model + """ + + def __init__(self, init_type=None): + self.init_type = init_type + + def __enter__(self): + if self.init_type is None: + return + else: + from paddlespeech.s2t.modules import align + align.global_init_type = self.init_type + return + + def __exit__(self, exc_type, exc_val, exc_tb): + from paddlespeech.s2t.modules import align + align.global_init_type = None diff --git a/paddlespeech/s2t/modules/positionwise_feed_forward.py b/paddlespeech/s2t/modules/positionwise_feed_forward.py index e2619cd49..c2725dc5c 100644 --- 
a/paddlespeech/s2t/modules/positionwise_feed_forward.py +++ b/paddlespeech/s2t/modules/positionwise_feed_forward.py @@ -17,6 +17,7 @@ import paddle from paddle import nn +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.utils.log import Log logger = Log(__name__).getlog() @@ -44,10 +45,10 @@ class PositionwiseFeedForward(nn.Layer): activation (paddle.nn.Layer): Activation function """ super().__init__() - self.w_1 = nn.Linear(idim, hidden_units) + self.w_1 = Linear(idim, hidden_units) self.activation = activation self.dropout = nn.Dropout(dropout_rate) - self.w_2 = nn.Linear(hidden_units, idim) + self.w_2 = Linear(hidden_units, idim) def forward(self, xs: paddle.Tensor) -> paddle.Tensor: """Forward function. diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py index 99a8300f2..88451ddd7 100644 --- a/paddlespeech/s2t/modules/subsampling.py +++ b/paddlespeech/s2t/modules/subsampling.py @@ -19,6 +19,9 @@ from typing import Tuple import paddle from paddle import nn +from paddlespeech.s2t.modules.align import Conv2D +from paddlespeech.s2t.modules.align import LayerNorm +from paddlespeech.s2t.modules.align import Linear from paddlespeech.s2t.modules.embedding import PositionalEncoding from paddlespeech.s2t.utils.log import Log @@ -60,8 +63,8 @@ class LinearNoSubsampling(BaseSubsampling): """ super().__init__(pos_enc_class) self.out = nn.Sequential( - nn.Linear(idim, odim), - nn.LayerNorm(odim, epsilon=1e-12), + Linear(idim, odim), + LayerNorm(odim, epsilon=1e-12), nn.Dropout(dropout_rate), nn.ReLU(), ) self.right_context = 0 @@ -108,12 +111,12 @@ class Conv2dSubsampling4(Conv2dSubsampling): """ super().__init__(pos_enc_class) self.conv = nn.Sequential( - nn.Conv2D(1, odim, 3, 2), + Conv2D(1, odim, 3, 2), nn.ReLU(), - nn.Conv2D(odim, odim, 3, 2), + Conv2D(odim, odim, 3, 2), nn.ReLU(), ) self.out = nn.Sequential( - nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) + Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) self.subsampling_rate = 4 # The right context for every conv layer is computed by: # (kernel_size - 1) * frame_rate_of_this_layer @@ -160,13 +163,13 @@ class Conv2dSubsampling6(Conv2dSubsampling): """ super().__init__(pos_enc_class) self.conv = nn.Sequential( - nn.Conv2D(1, odim, 3, 2), + Conv2D(1, odim, 3, 2), nn.ReLU(), - nn.Conv2D(odim, odim, 5, 3), + Conv2D(odim, odim, 5, 3), nn.ReLU(), ) # O = (I - F + Pstart + Pend) // S + 1 # when Padding == 0, O = (I - F - S) // S - self.linear = nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim) + self.linear = Linear(odim * (((idim - 1) // 2 - 2) // 3), odim) # The right context for every conv layer is computed by: # (kernel_size - 1) * frame_rate_of_this_layer # 10 = (3 - 1) * 1 + (5 - 1) * 2 @@ -212,14 +215,14 @@ class Conv2dSubsampling8(Conv2dSubsampling): """ super().__init__(pos_enc_class) self.conv = nn.Sequential( - nn.Conv2D(1, odim, 3, 2), + Conv2D(1, odim, 3, 2), nn.ReLU(), - nn.Conv2D(odim, odim, 3, 2), + Conv2D(odim, odim, 3, 2), nn.ReLU(), - nn.Conv2D(odim, odim, 3, 2), + Conv2D(odim, odim, 3, 2), nn.ReLU(), ) - self.linear = nn.Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), - odim) + self.linear = Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2), + odim) self.subsampling_rate = 8 # The right context for every conv layer is computed by: # (kernel_size - 1) * frame_rate_of_this_layer diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py index 889cd349d..4a65548fe 100644 --- 
a/paddlespeech/s2t/transform/spectrogram.py +++ b/paddlespeech/s2t/transform/spectrogram.py @@ -14,8 +14,11 @@ # Modified from espnet(https://github.com/espnet/espnet) import librosa import numpy as np +import paddle from python_speech_features import logfbank +import paddleaudio.compliance.kaldi as kaldi + def stft(x, n_fft, @@ -309,6 +312,77 @@ class IStft(): class LogMelSpectrogramKaldi(): + def __init__( + self, + fs=16000, + n_mels=80, + n_shift=160, # unit:sample, 10ms + win_length=400, # unit:sample, 25ms + energy_floor=0.0, + dither=0.1): + """ + The Kaldi implementation of LogMelSpectrogram + Args: + fs (int): sample rate of the audio + n_mels (int): number of mel filter banks + n_shift (int): number of points in a frame shift + win_length (int): number of points in a frame windows + energy_floor (float): Floor on energy in Spectrogram computation (absolute) + dither (float): Dithering constant + + Returns: + LogMelSpectrogramKaldi + """ + + self.fs = fs + self.n_mels = n_mels + num_point_ms = fs / 1000 + self.n_frame_length = win_length / num_point_ms + self.n_frame_shift = n_shift / num_point_ms + self.energy_floor = energy_floor + self.dither = dither + + def __repr__(self): + return ( + "{name}(fs={fs}, n_mels={n_mels}, " + "n_frame_shift={n_frame_shift}, n_frame_length={n_frame_length}, " + "dither={dither}))".format( + name=self.__class__.__name__, + fs=self.fs, + n_mels=self.n_mels, + n_frame_shift=self.n_frame_shift, + n_frame_length=self.n_frame_length, + dither=self.dither, )) + + def __call__(self, x, train): + """ + Args: + x (np.ndarray): shape (Ti,) + train (bool): True, train mode. + + Raises: + ValueError: not support (Ti, C) + + Returns: + np.ndarray: (T, D) + """ + dither = self.dither if train else 0.0 + if x.ndim != 1: + raise ValueError("Not support x: [Time, Channel]") + waveform = paddle.to_tensor(np.expand_dims(x, 0), dtype=paddle.float32) + mat = kaldi.fbank( + waveform, + n_mels=self.n_mels, + frame_length=self.n_frame_length, + frame_shift=self.n_frame_shift, + dither=dither, + energy_floor=self.energy_floor, + sr=self.fs) + mat = np.squeeze(mat.numpy()) + return mat + + +class LogMelSpectrogramKaldi_decay(): def __init__( self, fs=16000, diff --git a/paddlespeech/s2t/transform/transformation.py b/paddlespeech/s2t/transform/transformation.py index 381b0cdc9..3b433cb0b 100644 --- a/paddlespeech/s2t/transform/transformation.py +++ b/paddlespeech/s2t/transform/transformation.py @@ -31,6 +31,7 @@ import_alias = dict( freq_mask="paddlespeech.s2t.transform.spec_augment:FreqMask", spec_augment="paddlespeech.s2t.transform.spec_augment:SpecAugment", speed_perturbation="paddlespeech.s2t.transform.perturb:SpeedPerturbation", + speed_perturbation_sox="paddlespeech.s2t.transform.perturb:SpeedPerturbationSox", volume_perturbation="paddlespeech.s2t.transform.perturb:VolumePerturbation", noise_injection="paddlespeech.s2t.transform.perturb:NoiseInjection", bandpass_perturbation="paddlespeech.s2t.transform.perturb:BandpassPerturbation", diff --git a/paddlespeech/s2t/utils/bleu_score.py b/paddlespeech/s2t/utils/bleu_score.py index a50c000ae..d7eb9c7c6 100644 --- a/paddlespeech/s2t/utils/bleu_score.py +++ b/paddlespeech/s2t/utils/bleu_score.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) """This module provides functions to calculate bleu score in different level. e.g. 
wer for word-level, cer for char-level. """ diff --git a/paddlespeech/s2t/utils/cli_utils.py b/paddlespeech/s2t/utils/cli_utils.py index 4aee3f439..ccb0d3c97 100644 --- a/paddlespeech/s2t/utils/cli_utils.py +++ b/paddlespeech/s2t/utils/cli_utils.py @@ -14,9 +14,9 @@ # Modified from espnet(https://github.com/espnet/espnet) import sys from collections.abc import Sequence -from distutils.util import strtobool as dist_strtobool import numpy +from distutils.util import strtobool as dist_strtobool def strtobool(x): diff --git a/paddlespeech/s2t/utils/text_grid.py b/paddlespeech/s2t/utils/text_grid.py index 3af58c9ba..cbd9856e4 100644 --- a/paddlespeech/s2t/utils/text_grid.py +++ b/paddlespeech/s2t/utils/text_grid.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# Modified from wenet(https://github.com/wenet-e2e/wenet) from typing import Dict from typing import List from typing import Text diff --git a/paddlespeech/s2t/utils/utility.py b/paddlespeech/s2t/utils/utility.py index dc1be8159..fdd8c0292 100644 --- a/paddlespeech/s2t/utils/utility.py +++ b/paddlespeech/s2t/utils/utility.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. """Contains common utility functions.""" -import distutils.util import math import os import random @@ -21,6 +20,7 @@ from contextlib import contextmanager from pprint import pformat from typing import List +import distutils.util import numpy as np import paddle import soundfile diff --git a/paddlespeech/server/README.md b/paddlespeech/server/README.md index 4ce9605d6..819fe440d 100644 --- a/paddlespeech/server/README.md +++ b/paddlespeech/server/README.md @@ -10,7 +10,7 @@ paddlespeech_server help ``` ### Start the server - First set the service-related configuration parameters, similar to `./conf/application.yaml`, + First set the service-related configuration parameters, similar to `./conf/application.yaml`. Set `engine_list`, which represents the speech tasks included in the service to be started Then start the service: ```bash paddlespeech_server start --config_file ./conf/application.yaml @@ -23,7 +23,7 @@ ``` ### Access speech recognition services ``` - paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./tests/16_audio.wav + paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input input_16k.wav ``` ### Access text to speech services @@ -31,3 +31,7 @@ paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav ``` + ### Access audio classification services + ```bash + paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input input.wav + ``` diff --git a/paddlespeech/server/README_cn.md b/paddlespeech/server/README_cn.md index 2dfd9474b..c0a4a7336 100644 --- a/paddlespeech/server/README_cn.md +++ b/paddlespeech/server/README_cn.md @@ -10,7 +10,7 @@ paddlespeech_server help ``` ### 启动服务 - 首先设置服务相关配置文件,类似于 `./conf/application.yaml`,同时设置服务配置中的语音任务模型相关配置,类似于 `./conf/tts/tts.yaml`。 + 首先设置服务相关配置文件,类似于 `./conf/application.yaml`,设置 `engine_list`,该值表示即将启动的服务中包含的语音任务。 然后启动服务: ```bash paddlespeech_server start --config_file ./conf/application.yaml @@ -30,3 +30,8 @@ ```bash paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "你好,欢迎使用百度飞桨深度学习框架!" 
--output output.wav ``` + + ### 访问音频分类服务 + ```bash + paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input input.wav + ``` diff --git a/paddlespeech/server/__init__.py b/paddlespeech/server/__init__.py index 384061dda..97722c0a0 100644 --- a/paddlespeech/server/__init__.py +++ b/paddlespeech/server/__init__.py @@ -18,6 +18,7 @@ from .base_commands import ClientHelpCommand from .base_commands import ServerBaseCommand from .base_commands import ServerHelpCommand from .bin.paddlespeech_client import ASRClientExecutor +from .bin.paddlespeech_client import CLSClientExecutor from .bin.paddlespeech_client import TTSClientExecutor from .bin.paddlespeech_server import ServerExecutor diff --git a/paddlespeech/server/bin/main.py b/paddlespeech/server/bin/main.py index de5282993..81824c85c 100644 --- a/paddlespeech/server/bin/main.py +++ b/paddlespeech/server/bin/main.py @@ -17,8 +17,9 @@ import uvicorn from fastapi import FastAPI from paddlespeech.server.engine.engine_pool import init_engine_pool -from paddlespeech.server.restful.api import setup_router +from paddlespeech.server.restful.api import setup_router as setup_http_router from paddlespeech.server.utils.config import get_config +from paddlespeech.server.ws.api import setup_router as setup_ws_router app = FastAPI( title="PaddleSpeech Serving API", description="Api", version="0.0.1") @@ -35,7 +36,12 @@ def init(config): """ # init api api_list = list(engine.split("_")[0] for engine in config.engine_list) - api_router = setup_router(api_list) + if config.protocol == "websocket": + api_router = setup_ws_router(api_list) + elif config.protocol == "http": + api_router = setup_http_router(api_list) + else: + raise Exception("unsupported protocol") app.include_router(api_router) if not init_engine_pool(config): diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py index ee6ab7ad7..413f00872 100644 --- a/paddlespeech/server/bin/paddlespeech_client.py +++ b/paddlespeech/server/bin/paddlespeech_client.py @@ -31,7 +31,7 @@ from paddlespeech.cli.log import logger from paddlespeech.server.utils.audio_process import wav2pcm from paddlespeech.server.utils.util import wav2base64 -__all__ = ['TTSClientExecutor', 'ASRClientExecutor'] +__all__ = ['TTSClientExecutor', 'ASRClientExecutor', 'CLSClientExecutor'] @cli_client_register( @@ -70,13 +70,9 @@ class TTSClientExecutor(BaseExecutor): choices=[0, 8000, 16000], help='Sampling rate, the default is the same as the model') self.parser.add_argument( - '--output', - type=str, - default="./output.wav", - help='Synthesized audio file') + '--output', type=str, default=None, help='Synthesized audio file') - def postprocess(self, response_dict: dict, outfile: str) -> float: - wav_base64 = response_dict["result"]["audio"] + def postprocess(self, wav_base64: str, outfile: str) -> float: audio_data_byte = base64.b64decode(wav_base64) # from byte samples, sample_rate = soundfile.read( @@ -93,37 +89,38 @@ class TTSClientExecutor(BaseExecutor): else: logger.error("The format for saving audio only supports wav or pcm") - duration = len(samples) / sample_rate - return duration - def execute(self, argv: List[str]) -> bool: args = self.parser.parse_args(argv) - try: - url = 'http://' + args.server_ip + ":" + str( - args.port) + '/paddlespeech/tts' - request = { - "text": args.input, - "spk_id": args.spk_id, - "speed": args.speed, - "volume": args.volume, - "sample_rate": args.sample_rate, - "save_path": args.output - } - st = time.time() - response = 
requests.post(url, json.dumps(request)) - time_consume = time.time() - st - - response_dict = response.json() - duration = self.postprocess(response_dict, args.output) + input_ = args.input + server_ip = args.server_ip + port = args.port + spk_id = args.spk_id + speed = args.speed + volume = args.volume + sample_rate = args.sample_rate + output = args.output + try: + time_start = time.time() + res = self( + input=input_, + server_ip=server_ip, + port=port, + spk_id=spk_id, + speed=speed, + volume=volume, + sample_rate=sample_rate, + output=output) + time_end = time.time() + time_consume = time_end - time_start + response_dict = res.json() logger.info(response_dict["message"]) - logger.info("Save synthesized audio successfully on %s." % - (args.output)) - logger.info("Audio duration: %f s." % (duration)) + logger.info("Save synthesized audio successfully on %s." % (output)) + logger.info("Audio duration: %f s." % + (response_dict['result']['duration'])) logger.info("Response time: %f s." % (time_consume)) - return True - except BaseException: + except Exception as e: logger.error("Failed to synthesized audio.") return False @@ -136,7 +133,7 @@ class TTSClientExecutor(BaseExecutor): speed: float=1.0, volume: float=1.0, sample_rate: int=0, - output: str="./output.wav"): + output: str=None): """ Python API to call an executor. """ @@ -151,20 +148,11 @@ class TTSClientExecutor(BaseExecutor): "save_path": output } - try: - st = time.time() - response = requests.post(url, json.dumps(request)) - time_consume = time.time() - st - response_dict = response.json() - duration = self.postprocess(response_dict, output) - - print(response_dict["message"]) - print("Save synthesized audio successfully on %s." % (output)) - print("Audio duration: %f s." % (duration)) - print("Response time: %f s." % (time_consume)) - print("RTF: %f " % (time_consume / duration)) - except BaseException: - print("Failed to synthesized audio.") + res = requests.post(url, json.dumps(request)) + response_dict = res.json() + if output is not None: + self.postprocess(response_dict["result"]["audio"], output) + return res @cli_client_register( @@ -193,24 +181,27 @@ class ASRClientExecutor(BaseExecutor): def execute(self, argv: List[str]) -> bool: args = self.parser.parse_args(argv) - url = 'http://' + args.server_ip + ":" + str( - args.port) + '/paddlespeech/asr' - audio = wav2base64(args.input) - data = { - "audio": audio, - "audio_format": args.audio_format, - "sample_rate": args.sample_rate, - "lang": args.lang, - } - time_start = time.time() + input_ = args.input + server_ip = args.server_ip + port = args.port + sample_rate = args.sample_rate + lang = args.lang + audio_format = args.audio_format + try: - r = requests.post(url=url, data=json.dumps(data)) - # ending Timestamp + time_start = time.time() + res = self( + input=input_, + server_ip=server_ip, + port=port, + sample_rate=sample_rate, + lang=lang, + audio_format=audio_format) time_end = time.time() - logger.info(r.json()) - logger.info("time cost %f s." % (time_end - time_start)) + logger.info(res.json()) + logger.info("Response time %f s." 
% (time_end - time_start)) return True - except BaseException: + except Exception as e: logger.error("Failed to speech recognition.") return False @@ -234,12 +225,65 @@ class ASRClientExecutor(BaseExecutor): "sample_rate": sample_rate, "lang": lang, } - time_start = time.time() + + res = requests.post(url=url, data=json.dumps(data)) + return res + + +@cli_client_register( + name='paddlespeech_client.cls', description='visit cls service') +class CLSClientExecutor(BaseExecutor): + def __init__(self): + super(CLSClientExecutor, self).__init__() + self.parser = argparse.ArgumentParser( + prog='paddlespeech_client.cls', add_help=True) + self.parser.add_argument( + '--server_ip', type=str, default='127.0.0.1', help='server ip') + self.parser.add_argument( + '--port', type=int, default=8090, help='server port') + self.parser.add_argument( + '--input', + type=str, + default=None, + help='Audio file to classify.', + required=True) + self.parser.add_argument( + '--topk', + type=int, + default=1, + help='Return topk scores of classification result.') + + def execute(self, argv: List[str]) -> bool: + args = self.parser.parse_args(argv) + input_ = args.input + server_ip = args.server_ip + port = args.port + topk = args.topk + try: - r = requests.post(url=url, data=json.dumps(data)) - # ending Timestamp + time_start = time.time() + res = self(input=input_, server_ip=server_ip, port=port, topk=topk) time_end = time.time() - print(r.json()) - print("time cost %f s." % (time_end - time_start)) - except BaseException: - print("Failed to speech recognition.") + logger.info(res.json()) + logger.info("Response time %f s." % (time_end - time_start)) + return True + except Exception as e: + logger.error("Failed to speech classification.") + return False + + @stats_wrapper + def __call__(self, + input: str, + server_ip: str="127.0.0.1", + port: int=8090, + topk: int=1): + """ + Python API to call an executor. + """ + + url = 'http://' + server_ip + ":" + str(port) + '/paddlespeech/cls' + audio = wav2base64(input) + data = {"audio": audio, "topk": topk} + + res = requests.post(url=url, data=json.dumps(data)) + return res diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index 3d71f091b..f6a7f4295 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -103,13 +103,14 @@ class ServerStatsExecutor(): '--task', type=str, default=None, - choices=['asr', 'tts'], + choices=['asr', 'tts', 'cls'], help='Choose speech task.', required=True) - self.task_choices = ['asr', 'tts'] + self.task_choices = ['asr', 'tts', 'cls'] self.model_name_format = { 'asr': 'Model-Language-Sample Rate', - 'tts': 'Model-Language' + 'tts': 'Model-Language', + 'cls': 'Model-Sample Rate' } def show_support_models(self, pretrained_models: dict): @@ -174,53 +175,24 @@ class ServerStatsExecutor(): ) return False - @stats_wrapper - def __call__( - self, - task: str=None, ): - """ - Python API to call an executor. - """ - self.task = task - if self.task not in self.task_choices: - print("Please input correct speech task, choices = ['asr', 'tts']") - - elif self.task == 'asr': - try: - from paddlespeech.cli.asr.infer import pretrained_models - print( - "Here is the table of ASR pretrained models supported in the service." 
- ) - self.show_support_models(pretrained_models) - - # show ASR static pretrained model - from paddlespeech.server.engine.asr.paddleinference.asr_engine import pretrained_models - print( - "Here is the table of ASR static pretrained models supported in the service." - ) - self.show_support_models(pretrained_models) - - except BaseException: - print( - "Failed to get the table of ASR pretrained models supported in the service." - ) - - elif self.task == 'tts': + elif self.task == 'cls': try: - from paddlespeech.cli.tts.infer import pretrained_models - print( - "Here is the table of TTS pretrained models supported in the service." + from paddlespeech.cli.cls.infer import pretrained_models + logger.info( + "Here is the table of CLS pretrained models supported in the service." ) self.show_support_models(pretrained_models) - # show TTS static pretrained model - from paddlespeech.server.engine.tts.paddleinference.tts_engine import pretrained_models - print( - "Here is the table of TTS static pretrained models supported in the service." + # show CLS static pretrained model + from paddlespeech.server.engine.cls.paddleinference.cls_engine import pretrained_models + logger.info( + "Here is the table of CLS static pretrained models supported in the service." ) self.show_support_models(pretrained_models) + return True except BaseException: - print( - "Failed to get the table of TTS pretrained models supported in the service." + logger.error( + "Failed to get the table of CLS pretrained models supported in the service." ) + return False diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml index 6048450b7..849349c2d 100644 --- a/paddlespeech/server/conf/application.yaml +++ b/paddlespeech/server/conf/application.yaml @@ -8,13 +8,17 @@ port: 8090 # The task format in the engin_list is: _ # task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference'] - -engine_list: ['asr_python', 'tts_python'] +# protocol = ['websocket', 'http'] (only one can be selected). +# http only support offline engine type. 
+protocol: 'http' +engine_list: ['asr_python', 'tts_python', 'cls_python'] ################################################################################# # ENGINE CONFIG # ################################################################################# + +################################### ASR ######################################### ################### speech task: asr; engine_type: python ####################### asr_python: model: 'conformer_wenetspeech' @@ -46,6 +50,25 @@ asr_inference: summary: True # False -> do not show predictor config +################### speech task: asr; engine_type: online ####################### +asr_online: + model_type: 'deepspeech2online_aishell' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + +################################### TTS ######################################### ################### speech task: tts; engine_type: python ####################### tts_python: # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc', @@ -105,3 +128,30 @@ tts_inference: # others lang: 'zh' + +################################### CLS ######################################### +################### speech task: cls; engine_type: python ####################### +cls_python: + # model choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6'] + model: 'panns_cnn14' + cfg_path: # [optional] Config of cls task. + ckpt_path: # [optional] Checkpoint file of model. + label_file: # [optional] Label file of cls task. + device: # set 'gpu:id' or 'cpu' + + +################### speech task: cls; engine_type: inference ####################### +cls_inference: + # model_type choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6'] + model_type: 'panns_cnn14' + cfg_path: + model_path: # the pdmodel file of am static model [optional] + params_path: # the pdiparams file of am static model [optional] + label_file: # [optional] Label file of cls task. + + predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + diff --git a/paddlespeech/server/conf/ws_application.yaml b/paddlespeech/server/conf/ws_application.yaml new file mode 100644 index 000000000..ef23593ed --- /dev/null +++ b/paddlespeech/server/conf/ws_application.yaml @@ -0,0 +1,51 @@ +# This is the parameter configuration file for PaddleSpeech Serving. + +################################################################################# +# SERVER SETTING # +################################################################################# +host: 0.0.0.0 +port: 8091 + +# The task format in the engin_list is: _ +# task choices = ['asr_online', 'tts_online'] +# protocol = ['websocket', 'http'] (only one can be selected). +# websocket only support online engine type. 
+protocol: 'websocket' +engine_list: ['asr_online'] + + +################################################################################# +# ENGINE CONFIG # +################################################################################# + +################################### ASR ######################################### +################### speech task: asr; engine_type: online ####################### +asr_online: + model_type: 'deepspeech2online_aishell' + am_model: # the pdmodel file of am static model [optional] + am_params: # the pdiparams file of am static model [optional] + lang: 'zh' + sample_rate: 16000 + cfg_path: + decode_method: + force_yes: True + + am_predictor_conf: + device: # set 'gpu:id' or 'cpu' + switch_ir_optim: True + glog_info: False # True -> print glog + summary: True # False -> do not show predictor config + + chunk_buffer_conf: + frame_duration_ms: 80 + shift_ms: 40 + sample_rate: 16000 + sample_width: 2 + + vad_conf: + aggressiveness: 2 + sample_rate: 16000 + frame_duration_ms: 20 + sample_width: 2 + padding_ms: 200 + padding_ratio: 0.9 diff --git a/paddlespeech/server/engine/asr/online/__init__.py b/paddlespeech/server/engine/asr/online/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddlespeech/server/engine/asr/online/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py new file mode 100644 index 000000000..389175a0a --- /dev/null +++ b/paddlespeech/server/engine/asr/online/asr_engine.py @@ -0,0 +1,352 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
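The `asr_online` engine added below consumes the configuration block defined in `ws_application.yaml` above. As a quick sanity check, that block can be loaded with the same yacs `CfgNode` API the engine itself uses; a minimal sketch (the file path and the commented values simply mirror the config added in this patch):

```python
from yacs.config import CfgNode

# Load the websocket server config introduced above (same CfgNode API as the engine).
config = CfgNode(new_allowed=True)
config.merge_from_file("paddlespeech/server/conf/ws_application.yaml")

print(config.protocol)                                         # 'websocket'
print(config.engine_list)                                      # ['asr_online']
print(config.asr_online.model_type)                            # 'deepspeech2online_aishell'
print(config.asr_online.chunk_buffer_conf.frame_duration_ms)   # 80
```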
+import os +from typing import Optional + +import numpy as np +import paddle +from numpy import float32 +from yacs.config import CfgNode + +from paddlespeech.cli.asr.infer import ASRExecutor +from paddlespeech.cli.log import logger +from paddlespeech.cli.utils import MODEL_HOME +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.frontend.speech import SpeechSegment +from paddlespeech.s2t.modules.ctc import CTCDecoder +from paddlespeech.s2t.utils.utility import UpdateConfig +from paddlespeech.server.engine.base_engine import BaseEngine +from paddlespeech.server.utils.paddle_predictor import init_predictor + +__all__ = ['ASREngine'] + +pretrained_models = { + "deepspeech2online_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar.gz', + 'md5': + 'd5e076217cf60486519f72c217d21b9b', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2_online/checkpoints/avg_1', + 'model': + 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel', + 'params': + 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, +} + + +class ASRServerExecutor(ASRExecutor): + def __init__(self): + super().__init__() + pass + + def _init_from_path(self, + model_type: str='wenetspeech', + am_model: Optional[os.PathLike]=None, + am_params: Optional[os.PathLike]=None, + lang: str='zh', + sample_rate: int=16000, + cfg_path: Optional[os.PathLike]=None, + decode_method: str='attention_rescoring', + am_predictor_conf: dict=None): + """ + Init model and other resources from a specific path. + """ + + if cfg_path is None or am_model is None or am_params is None: + sample_rate_str = '16k' if sample_rate == 16000 else '8k' + tag = model_type + '-' + lang + '-' + sample_rate_str + res_path = self._get_pretrained_path(tag) # wenetspeech_zh + self.res_path = res_path + self.cfg_path = os.path.join(res_path, + pretrained_models[tag]['cfg_path']) + + self.am_model = os.path.join(res_path, + pretrained_models[tag]['model']) + self.am_params = os.path.join(res_path, + pretrained_models[tag]['params']) + logger.info(res_path) + logger.info(self.cfg_path) + logger.info(self.am_model) + logger.info(self.am_params) + else: + self.cfg_path = os.path.abspath(cfg_path) + self.am_model = os.path.abspath(am_model) + self.am_params = os.path.abspath(am_params) + self.res_path = os.path.dirname( + os.path.dirname(os.path.abspath(self.cfg_path))) + + #Init body. 
+ self.config = CfgNode(new_allowed=True) + self.config.merge_from_file(self.cfg_path) + + with UpdateConfig(self.config): + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: + from paddlespeech.s2t.io.collator import SpeechCollator + self.vocab = self.config.vocab_filepath + self.config.decode.lang_model_path = os.path.join( + MODEL_HOME, 'language_model', + self.config.decode.lang_model_path) + self.collate_fn_test = SpeechCollator.from_config(self.config) + self.text_feature = TextFeaturizer( + unit_type=self.config.unit_type, vocab=self.vocab) + + lm_url = pretrained_models[tag]['lm_url'] + lm_md5 = pretrained_models[tag]['lm_md5'] + self.download_lm( + lm_url, + os.path.dirname(self.config.decode.lang_model_path), lm_md5) + elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: + raise Exception("wrong type") + else: + raise Exception("wrong type") + + # AM predictor + self.am_predictor_conf = am_predictor_conf + self.am_predictor = init_predictor( + model_file=self.am_model, + params_file=self.am_params, + predictor_conf=self.am_predictor_conf) + + # decoder + self.decoder = CTCDecoder( + odim=self.config.output_dim, # is in vocab + enc_n_units=self.config.rnn_layer_size * 2, + blank_id=self.config.blank_id, + dropout_rate=0.0, + reduction=True, # sum + batch_average=True, # sum / batch_size + grad_norm_type=self.config.get('ctc_grad_norm_type', None)) + + # init decoder + cfg = self.config.decode + decode_batch_size = 1 # for online + self.decoder.init_decoder( + decode_batch_size, self.text_feature.vocab_list, + cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta, + cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n, + cfg.num_proc_bsearch) + + # init state box + self.chunk_state_h_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=float32) + self.chunk_state_c_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=float32) + + def reset_decoder_and_chunk(self): + """reset decoder and chunk state for an new audio + """ + self.decoder.reset_decoder(batch_size=1) + # init state box, for new audio request + self.chunk_state_h_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=float32) + self.chunk_state_c_box = np.zeros( + (self.config.num_rnn_layers, 1, self.config.rnn_layer_size), + dtype=float32) + + def decode_one_chunk(self, x_chunk, x_chunk_lens, model_type: str): + """decode one chunk + + Args: + x_chunk (numpy.array): shape[B, T, D] + x_chunk_lens (numpy.array): shape[B] + model_type (str): online model type + + Returns: + [type]: [description] + """ + if "deepspeech2online" in model_type: + input_names = self.am_predictor.get_input_names() + audio_handle = self.am_predictor.get_input_handle(input_names[0]) + audio_len_handle = self.am_predictor.get_input_handle( + input_names[1]) + h_box_handle = self.am_predictor.get_input_handle(input_names[2]) + c_box_handle = self.am_predictor.get_input_handle(input_names[3]) + + audio_handle.reshape(x_chunk.shape) + audio_handle.copy_from_cpu(x_chunk) + + audio_len_handle.reshape(x_chunk_lens.shape) + audio_len_handle.copy_from_cpu(x_chunk_lens) + + h_box_handle.reshape(self.chunk_state_h_box.shape) + h_box_handle.copy_from_cpu(self.chunk_state_h_box) + + c_box_handle.reshape(self.chunk_state_c_box.shape) + c_box_handle.copy_from_cpu(self.chunk_state_c_box) + + output_names = self.am_predictor.get_output_names() + output_handle = 
self.am_predictor.get_output_handle(output_names[0]) + output_lens_handle = self.am_predictor.get_output_handle( + output_names[1]) + output_state_h_handle = self.am_predictor.get_output_handle( + output_names[2]) + output_state_c_handle = self.am_predictor.get_output_handle( + output_names[3]) + + self.am_predictor.run() + + output_chunk_probs = output_handle.copy_to_cpu() + output_chunk_lens = output_lens_handle.copy_to_cpu() + self.chunk_state_h_box = output_state_h_handle.copy_to_cpu() + self.chunk_state_c_box = output_state_c_handle.copy_to_cpu() + + self.decoder.next(output_chunk_probs, output_chunk_lens) + trans_best, trans_beam = self.decoder.decode() + + return trans_best[0] + + elif "conformer" in model_type or "transformer" in model_type: + raise Exception("invalid model name") + else: + raise Exception("invalid model name") + + def _pcm16to32(self, audio): + """pcm int16 to float32 + + Args: + audio(numpy.array): numpy.int16 + + Returns: + audio(numpy.array): numpy.float32 + """ + if audio.dtype == np.int16: + audio = audio.astype("float32") + bits = np.iinfo(np.int16).bits + audio = audio / (2**(bits - 1)) + return audio + + def extract_feat(self, samples, sample_rate): + """extract feat + + Args: + samples (numpy.array): numpy.float32 + sample_rate (int): sample rate + + Returns: + x_chunk (numpy.array): shape[B, T, D] + x_chunk_lens (numpy.array): shape[B] + """ + # pcm16 -> pcm 32 + samples = self._pcm16to32(samples) + + # read audio + speech_segment = SpeechSegment.from_pcm( + samples, sample_rate, transcript=" ") + # audio augment + self.collate_fn_test.augmentation.transform_audio(speech_segment) + + # extract speech feature + spectrum, transcript_part = self.collate_fn_test._speech_featurizer.featurize( + speech_segment, self.collate_fn_test.keep_transcription_text) + # CMVN spectrum + if self.collate_fn_test._normalizer: + spectrum = self.collate_fn_test._normalizer.apply(spectrum) + + # spectrum augment + audio = self.collate_fn_test.augmentation.transform_feature(spectrum) + + audio_len = audio.shape[0] + audio = paddle.to_tensor(audio, dtype='float32') + # audio_len = paddle.to_tensor(audio_len) + audio = paddle.unsqueeze(audio, axis=0) + + x_chunk = audio.numpy() + x_chunk_lens = np.array([audio_len]) + + return x_chunk, x_chunk_lens + + +class ASREngine(BaseEngine): + """ASR server engine + + Args: + metaclass: Defaults to Singleton. 
+ """ + + def __init__(self): + super(ASREngine, self).__init__() + + def init(self, config: dict) -> bool: + """init engine resource + + Args: + config_file (str): config file + + Returns: + bool: init failed or success + """ + self.input = None + self.output = "" + self.executor = ASRServerExecutor() + self.config = config + + self.executor._init_from_path( + model_type=self.config.model_type, + am_model=self.config.am_model, + am_params=self.config.am_params, + lang=self.config.lang, + sample_rate=self.config.sample_rate, + cfg_path=self.config.cfg_path, + decode_method=self.config.decode_method, + am_predictor_conf=self.config.am_predictor_conf) + + logger.info("Initialize ASR server engine successfully.") + return True + + def preprocess(self, samples, sample_rate): + """preprocess + + Args: + samples (numpy.array): numpy.float32 + sample_rate (int): sample rate + + Returns: + x_chunk (numpy.array): shape[B, T, D] + x_chunk_lens (numpy.array): shape[B] + """ + x_chunk, x_chunk_lens = self.executor.extract_feat(samples, sample_rate) + return x_chunk, x_chunk_lens + + def run(self, x_chunk, x_chunk_lens, decoder_chunk_size=1): + """run online engine + + Args: + x_chunk (numpy.array): shape[B, T, D] + x_chunk_lens (numpy.array): shape[B] + decoder_chunk_size(int) + """ + self.output = self.executor.decode_one_chunk(x_chunk, x_chunk_lens, + self.config.model_type) + + def postprocess(self): + """postprocess + """ + return self.output + + def reset(self): + """reset engine decoder and inference state + """ + self.executor.reset_decoder_and_chunk() + self.output = "" diff --git a/paddlespeech/server/engine/cls/__init__.py b/paddlespeech/server/engine/cls/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddlespeech/server/engine/cls/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/server/engine/cls/paddleinference/__init__.py b/paddlespeech/server/engine/cls/paddleinference/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddlespeech/server/engine/cls/paddleinference/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
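Before moving on to the CLS engines, note how the streaming `ASREngine` above is meant to be driven: `preprocess` turns a chunk of samples into a feature chunk, `run` pushes it through the exported DeepSpeech2 predictor while carrying the h/c state boxes, `postprocess` returns the best partial transcript, and `reset` clears decoder and state between utterances. A minimal sketch of that loop, assuming `config` is the loaded `asr_online` block and `pcm_chunks` yields int16 numpy chunks of a 16 kHz stream (both are placeholders, not part of this patch):

```python
from paddlespeech.server.engine.asr.online.asr_engine import ASREngine

engine = ASREngine()
engine.init(config)                     # assumption: config is the loaded asr_online block

for samples in pcm_chunks:              # assumption: int16 numpy chunks of a 16 kHz stream
    x_chunk, x_chunk_lens = engine.preprocess(samples, sample_rate=16000)
    engine.run(x_chunk, x_chunk_lens)   # carries the h/c state boxes across chunks
    print(engine.postprocess())         # best partial transcript so far

engine.reset()                          # clear decoder and state before the next audio
```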
diff --git a/paddlespeech/server/engine/cls/paddleinference/cls_engine.py b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py new file mode 100644 index 000000000..3982effd9 --- /dev/null +++ b/paddlespeech/server/engine/cls/paddleinference/cls_engine.py @@ -0,0 +1,224 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import io +import os +import time +from typing import Optional + +import numpy as np +import paddle +import yaml + +from paddlespeech.cli.cls.infer import CLSExecutor +from paddlespeech.cli.log import logger +from paddlespeech.cli.utils import download_and_decompress +from paddlespeech.cli.utils import MODEL_HOME +from paddlespeech.server.engine.base_engine import BaseEngine +from paddlespeech.server.utils.paddle_predictor import init_predictor +from paddlespeech.server.utils.paddle_predictor import run_model + +__all__ = ['CLSEngine'] + +pretrained_models = { + "panns_cnn6-32k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz', + 'md5': + 'da087c31046d23281d8ec5188c1967da', + 'cfg_path': + 'panns.yaml', + 'model_path': + 'inference.pdmodel', + 'params_path': + 'inference.pdiparams', + 'label_file': + 'audioset_labels.txt', + }, + "panns_cnn10-32k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz', + 'md5': + '5460cc6eafbfaf0f261cc75b90284ae1', + 'cfg_path': + 'panns.yaml', + 'model_path': + 'inference.pdmodel', + 'params_path': + 'inference.pdiparams', + 'label_file': + 'audioset_labels.txt', + }, + "panns_cnn14-32k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz', + 'md5': + 'ccc80b194821274da79466862b2ab00f', + 'cfg_path': + 'panns.yaml', + 'model_path': + 'inference.pdmodel', + 'params_path': + 'inference.pdiparams', + 'label_file': + 'audioset_labels.txt', + }, +} + + +class CLSServerExecutor(CLSExecutor): + def __init__(self): + super().__init__() + pass + + def _get_pretrained_path(self, tag: str) -> os.PathLike: + """ + Download and returns pretrained resources path of current task. + """ + support_models = list(pretrained_models.keys()) + assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format( + tag, '\n\t\t'.join(support_models)) + + res_path = os.path.join(MODEL_HOME, tag) + decompressed_path = download_and_decompress(pretrained_models[tag], + res_path) + decompressed_path = os.path.abspath(decompressed_path) + logger.info( + 'Use pretrained model stored in: {}'.format(decompressed_path)) + + return decompressed_path + + def _init_from_path( + self, + model_type: str='panns_cnn14', + cfg_path: Optional[os.PathLike]=None, + model_path: Optional[os.PathLike]=None, + params_path: Optional[os.PathLike]=None, + label_file: Optional[os.PathLike]=None, + predictor_conf: dict=None, ): + """ + Init model and other resources from a specific path. 
+ """ + + if cfg_path is None or model_path is None or params_path is None or label_file is None: + tag = model_type + '-' + '32k' + self.res_path = self._get_pretrained_path(tag) + self.cfg_path = os.path.join(self.res_path, + pretrained_models[tag]['cfg_path']) + self.model_path = os.path.join(self.res_path, + pretrained_models[tag]['model_path']) + self.params_path = os.path.join( + self.res_path, pretrained_models[tag]['params_path']) + self.label_file = os.path.join(self.res_path, + pretrained_models[tag]['label_file']) + else: + self.cfg_path = os.path.abspath(cfg_path) + self.model_path = os.path.abspath(model_path) + self.params_path = os.path.abspath(params_path) + self.label_file = os.path.abspath(label_file) + + logger.info(self.cfg_path) + logger.info(self.model_path) + logger.info(self.params_path) + logger.info(self.label_file) + + # config + with open(self.cfg_path, 'r') as f: + self._conf = yaml.safe_load(f) + logger.info("Read cfg file successfully.") + + # labels + self._label_list = [] + with open(self.label_file, 'r') as f: + for line in f: + self._label_list.append(line.strip()) + logger.info("Read label file successfully.") + + # Create predictor + self.predictor_conf = predictor_conf + self.predictor = init_predictor( + model_file=self.model_path, + params_file=self.params_path, + predictor_conf=self.predictor_conf) + logger.info("Create predictor successfully.") + + @paddle.no_grad() + def infer(self): + """ + Model inference and result stored in self.output. + """ + output = run_model(self.predictor, [self._inputs['feats'].numpy()]) + self._outputs['logits'] = output[0] + + +class CLSEngine(BaseEngine): + """CLS server engine + + Args: + metaclass: Defaults to Singleton. + """ + + def __init__(self): + super(CLSEngine, self).__init__() + + def init(self, config: dict) -> bool: + """init engine resource + + Args: + config_file (str): config file + + Returns: + bool: init failed or success + """ + self.executor = CLSServerExecutor() + self.config = config + self.executor._init_from_path( + self.config.model_type, self.config.cfg_path, + self.config.model_path, self.config.params_path, + self.config.label_file, self.config.predictor_conf) + + logger.info("Initialize CLS server engine successfully.") + return True + + def run(self, audio_data): + """engine run + + Args: + audio_data (bytes): base64.b64decode + """ + + self.executor.preprocess(io.BytesIO(audio_data)) + st = time.time() + self.executor.infer() + infer_time = time.time() - st + + logger.info("inference time: {}".format(infer_time)) + logger.info("cls engine type: inference") + + def postprocess(self, topk: int): + """postprocess + """ + assert topk <= len(self.executor._label_list + ), 'Value of topk is larger than number of labels.' + + result = np.squeeze(self.executor._outputs['logits'], axis=0) + topk_idx = (-result).argsort()[:topk] + topk_results = [] + for idx in topk_idx: + res = {} + label, score = self.executor._label_list[idx], result[idx] + res['class_name'] = label + res['prob'] = score + topk_results.append(res) + + return topk_results diff --git a/paddlespeech/server/engine/cls/python/__init__.py b/paddlespeech/server/engine/cls/python/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddlespeech/server/engine/cls/python/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/server/engine/cls/python/cls_engine.py b/paddlespeech/server/engine/cls/python/cls_engine.py new file mode 100644 index 000000000..1a975b0a0 --- /dev/null +++ b/paddlespeech/server/engine/cls/python/cls_engine.py @@ -0,0 +1,124 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import io +import time +from typing import List + +import paddle + +from paddlespeech.cli.cls.infer import CLSExecutor +from paddlespeech.cli.log import logger +from paddlespeech.server.engine.base_engine import BaseEngine + +__all__ = ['CLSEngine'] + + +class CLSServerExecutor(CLSExecutor): + def __init__(self): + super().__init__() + pass + + def get_topk_results(self, topk: int) -> List: + assert topk <= len( + self._label_list), 'Value of topk is larger than number of labels.' + + result = self._outputs['logits'].squeeze(0).numpy() + topk_idx = (-result).argsort()[:topk] + res = {} + topk_results = [] + for idx in topk_idx: + label, score = self._label_list[idx], result[idx] + res['class'] = label + res['prob'] = score + topk_results.append(res) + return topk_results + + +class CLSEngine(BaseEngine): + """CLS server engine + + Args: + metaclass: Defaults to Singleton. + """ + + def __init__(self): + super(CLSEngine, self).__init__() + + def init(self, config: dict) -> bool: + """init engine resource + + Args: + config_file (str): config file + + Returns: + bool: init failed or success + """ + self.input = None + self.output = None + self.executor = CLSServerExecutor() + self.config = config + try: + if self.config.device: + self.device = self.config.device + else: + self.device = paddle.get_device() + paddle.set_device(self.device) + except BaseException: + logger.error( + "Set device failed, please check if device is already used and the parameter 'device' in the yaml file" + ) + + try: + self.executor._init_from_path( + self.config.model, self.config.cfg_path, self.config.ckpt_path, + self.config.label_file) + except BaseException: + logger.error("Initialize CLS server engine Failed.") + return False + + logger.info("Initialize CLS server engine successfully on device: %s." 
% + (self.device)) + return True + + def run(self, audio_data): + """engine run + + Args: + audio_data (bytes): base64.b64decode + """ + self.executor.preprocess(io.BytesIO(audio_data)) + st = time.time() + self.executor.infer() + infer_time = time.time() - st + + logger.info("inference time: {}".format(infer_time)) + logger.info("cls engine type: python") + + def postprocess(self, topk: int): + """postprocess + """ + assert topk <= len(self.executor._label_list + ), 'Value of topk is larger than number of labels.' + + result = self.executor._outputs['logits'].squeeze(0).numpy() + topk_idx = (-result).argsort()[:topk] + topk_results = [] + for idx in topk_idx: + res = {} + label, score = self.executor._label_list[idx], result[idx] + res['class_name'] = label + res['prob'] = score + topk_results.append(res) + + return topk_results diff --git a/paddlespeech/server/engine/engine_factory.py b/paddlespeech/server/engine/engine_factory.py index 546541edf..2a39fb79b 100644 --- a/paddlespeech/server/engine/engine_factory.py +++ b/paddlespeech/server/engine/engine_factory.py @@ -25,11 +25,20 @@ class EngineFactory(object): elif engine_name == 'asr' and engine_type == 'python': from paddlespeech.server.engine.asr.python.asr_engine import ASREngine return ASREngine() + elif engine_name == 'asr' and engine_type == 'online': + from paddlespeech.server.engine.asr.online.asr_engine import ASREngine + return ASREngine() elif engine_name == 'tts' and engine_type == 'inference': from paddlespeech.server.engine.tts.paddleinference.tts_engine import TTSEngine return TTSEngine() elif engine_name == 'tts' and engine_type == 'python': from paddlespeech.server.engine.tts.python.tts_engine import TTSEngine return TTSEngine() + elif engine_name == 'cls' and engine_type == 'inference': + from paddlespeech.server.engine.cls.paddleinference.cls_engine import CLSEngine + return CLSEngine() + elif engine_name == 'cls' and engine_type == 'python': + from paddlespeech.server.engine.cls.python.cls_engine import CLSEngine + return CLSEngine() else: return None diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py index 1bbbe0ea3..db8813ba9 100644 --- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py +++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py @@ -250,27 +250,21 @@ class TTSServerExecutor(TTSExecutor): self.frontend = English(phone_vocab_path=self.phones_dict) logger.info("frontend done!") - try: - # am predictor - self.am_predictor_conf = am_predictor_conf - self.am_predictor = init_predictor( - model_file=self.am_model, - params_file=self.am_params, - predictor_conf=self.am_predictor_conf) - logger.info("Create AM predictor successfully.") - except BaseException: - logger.error("Failed to create AM predictor.") - - try: - # voc predictor - self.voc_predictor_conf = voc_predictor_conf - self.voc_predictor = init_predictor( - model_file=self.voc_model, - params_file=self.voc_params, - predictor_conf=self.voc_predictor_conf) - logger.info("Create Vocoder predictor successfully.") - except BaseException: - logger.error("Failed to create Vocoder predictor.") + # Create am predictor + self.am_predictor_conf = am_predictor_conf + self.am_predictor = init_predictor( + model_file=self.am_model, + params_file=self.am_params, + predictor_conf=self.am_predictor_conf) + logger.info("Create AM predictor successfully.") + + # Create voc predictor + self.voc_predictor_conf = voc_predictor_conf + self.voc_predictor = 
init_predictor( + model_file=self.voc_model, + params_file=self.voc_params, + predictor_conf=self.voc_predictor_conf) + logger.info("Create Vocoder predictor successfully.") @paddle.no_grad() def infer(self, @@ -359,27 +353,22 @@ class TTSEngine(BaseEngine): def init(self, config: dict) -> bool: self.executor = TTSServerExecutor() - try: - self.config = config - self.executor._init_from_path( - am=self.config.am, - am_model=self.config.am_model, - am_params=self.config.am_params, - am_sample_rate=self.config.am_sample_rate, - phones_dict=self.config.phones_dict, - tones_dict=self.config.tones_dict, - speaker_dict=self.config.speaker_dict, - voc=self.config.voc, - voc_model=self.config.voc_model, - voc_params=self.config.voc_params, - voc_sample_rate=self.config.voc_sample_rate, - lang=self.config.lang, - am_predictor_conf=self.config.am_predictor_conf, - voc_predictor_conf=self.config.voc_predictor_conf, ) - - except BaseException: - logger.error("Initialize TTS server engine Failed.") - return False + self.config = config + self.executor._init_from_path( + am=self.config.am, + am_model=self.config.am_model, + am_params=self.config.am_params, + am_sample_rate=self.config.am_sample_rate, + phones_dict=self.config.phones_dict, + tones_dict=self.config.tones_dict, + speaker_dict=self.config.speaker_dict, + voc=self.config.voc, + voc_model=self.config.voc_model, + voc_params=self.config.voc_params, + voc_sample_rate=self.config.voc_sample_rate, + lang=self.config.lang, + am_predictor_conf=self.config.am_predictor_conf, + voc_predictor_conf=self.config.voc_predictor_conf, ) logger.info("Initialize TTS server engine successfully.") return True @@ -542,4 +531,4 @@ class TTSEngine(BaseEngine): postprocess_time)) logger.info("RTF: {}".format(rtf)) - return lang, target_sample_rate, wav_base64 + return lang, target_sample_rate, duration, wav_base64 diff --git a/paddlespeech/server/engine/tts/python/tts_engine.py b/paddlespeech/server/engine/tts/python/tts_engine.py index 8d6c7fd17..f153f60b9 100644 --- a/paddlespeech/server/engine/tts/python/tts_engine.py +++ b/paddlespeech/server/engine/tts/python/tts_engine.py @@ -250,4 +250,4 @@ class TTSEngine(BaseEngine): logger.info("RTF: {}".format(rtf)) logger.info("device: {}".format(self.device)) - return lang, target_sample_rate, wav_base64 + return lang, target_sample_rate, duration, wav_base64 diff --git a/paddlespeech/server/restful/api.py b/paddlespeech/server/restful/api.py index 2d69dee87..3f91a03b6 100644 --- a/paddlespeech/server/restful/api.py +++ b/paddlespeech/server/restful/api.py @@ -16,6 +16,7 @@ from typing import List from fastapi import APIRouter from paddlespeech.server.restful.asr_api import router as asr_router +from paddlespeech.server.restful.cls_api import router as cls_router from paddlespeech.server.restful.tts_api import router as tts_router _router = APIRouter() @@ -25,7 +26,7 @@ def setup_router(api_list: List): """setup router for fastapi Args: - api_list (List): [asr, tts] + api_list (List): [asr, tts, cls] Returns: APIRouter @@ -35,6 +36,8 @@ def setup_router(api_list: List): _router.include_router(asr_router) elif api_name == 'tts': _router.include_router(tts_router) + elif api_name == 'cls': + _router.include_router(cls_router) else: pass diff --git a/paddlespeech/server/restful/cls_api.py b/paddlespeech/server/restful/cls_api.py new file mode 100644 index 000000000..306d9ca9c --- /dev/null +++ b/paddlespeech/server/restful/cls_api.py @@ -0,0 +1,92 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import base64 +import traceback +from typing import Union + +from fastapi import APIRouter + +from paddlespeech.server.engine.engine_pool import get_engine_pool +from paddlespeech.server.restful.request import CLSRequest +from paddlespeech.server.restful.response import CLSResponse +from paddlespeech.server.restful.response import ErrorResponse +from paddlespeech.server.utils.errors import ErrorCode +from paddlespeech.server.utils.errors import failed_response +from paddlespeech.server.utils.exception import ServerBaseException + +router = APIRouter() + + +@router.get('/paddlespeech/cls/help') +def help(): + """help + + Returns: + json: [description] + """ + response = { + "success": "True", + "code": 200, + "message": { + "global": "success" + }, + "result": { + "description": "cls server", + "input": "base64 string of wavfile", + "output": "classification result" + } + } + return response + + +@router.post( + "/paddlespeech/cls", response_model=Union[CLSResponse, ErrorResponse]) +def cls(request_body: CLSRequest): + """cls api + + Args: + request_body (CLSRequest): [description] + + Returns: + json: [description] + """ + try: + audio_data = base64.b64decode(request_body.audio) + + # get single engine from engine pool + engine_pool = get_engine_pool() + cls_engine = engine_pool['cls'] + + cls_engine.run(audio_data) + cls_results = cls_engine.postprocess(request_body.topk) + + response = { + "success": True, + "code": 200, + "message": { + "description": "success" + }, + "result": { + "topk": request_body.topk, + "results": cls_results + } + } + + except ServerBaseException as e: + response = failed_response(e.error_code, e.msg) + except BaseException: + response = failed_response(ErrorCode.SERVER_UNKOWN_ERR) + traceback.print_exc() + + return response diff --git a/paddlespeech/server/restful/request.py b/paddlespeech/server/restful/request.py index 289088019..dbac9dac8 100644 --- a/paddlespeech/server/restful/request.py +++ b/paddlespeech/server/restful/request.py @@ -15,7 +15,7 @@ from typing import Optional from pydantic import BaseModel -__all__ = ['ASRRequest', 'TTSRequest'] +__all__ = ['ASRRequest', 'TTSRequest', 'CLSRequest'] #****************************************************************************************/ @@ -63,3 +63,18 @@ class TTSRequest(BaseModel): volume: float = 1.0 sample_rate: int = 0 save_path: str = None + + +#****************************************************************************************/ +#************************************ CLS request ***************************************/ +#****************************************************************************************/ +class CLSRequest(BaseModel): + """ + request body example + { + "audio": "exSI6ICJlbiIsCgkgICAgInBvc2l0aW9uIjogImZhbHNlIgoJf...", + "topk": 1 + } + """ + audio: str + topk: int = 1 diff --git a/paddlespeech/server/restful/response.py b/paddlespeech/server/restful/response.py index 4e18ee0d7..a2a207e4f 100644 --- 
a/paddlespeech/server/restful/response.py +++ b/paddlespeech/server/restful/response.py @@ -11,9 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from typing import List + from pydantic import BaseModel -__all__ = ['ASRResponse', 'TTSResponse'] +__all__ = ['ASRResponse', 'TTSResponse', 'CLSResponse'] class Message(BaseModel): @@ -52,10 +54,11 @@ class ASRResponse(BaseModel): #****************************************************************************************/ class TTSResult(BaseModel): lang: str = "zh" - sample_rate: int spk_id: int = 0 speed: float = 1.0 volume: float = 1.0 + sample_rate: int + duration: float save_path: str = None audio: str @@ -71,9 +74,11 @@ class TTSResponse(BaseModel): }, "result": { "lang": "zh", - "sample_rate": 24000, + "spk_id": 0, "speed": 1.0, "volume": 1.0, + "sample_rate": 24000, + "duration": 3.6125, "audio": "LTI1OTIuNjI1OTUwMzQsOTk2OS41NDk4...", "save_path": "./tts.wav" } @@ -85,6 +90,45 @@ class TTSResponse(BaseModel): result: TTSResult +#****************************************************************************************/ +#************************************ CLS response **************************************/ +#****************************************************************************************/ +class CLSResults(BaseModel): + class_name: str + prob: float + + +class CLSResult(BaseModel): + topk: int + results: List[CLSResults] + + +class CLSResponse(BaseModel): + """ + response example + { + "success": true, + "code": 0, + "message": { + "description": "success" + }, + "result": { + topk: 1 + results: [ + { + "class":"Speech", + "prob": 0.9027184844017029 + } + ] + } + } + """ + success: bool + code: int + message: Message + result: CLSResult + + #****************************************************************************************/ #********************************** Error response **************************************/ #****************************************************************************************/ diff --git a/paddlespeech/server/restful/tts_api.py b/paddlespeech/server/restful/tts_api.py index 0af0f6d07..4e9bbe23e 100644 --- a/paddlespeech/server/restful/tts_api.py +++ b/paddlespeech/server/restful/tts_api.py @@ -98,7 +98,7 @@ def tts(request_body: TTSRequest): tts_engine = engine_pool['tts'] logger.info("Get tts engine successfully.") - lang, target_sample_rate, wav_base64 = tts_engine.run( + lang, target_sample_rate, duration, wav_base64 = tts_engine.run( text, spk_id, speed, volume, sample_rate, save_path) response = { @@ -113,6 +113,7 @@ def tts(request_body: TTSRequest): "speed": speed, "volume": volume, "sample_rate": target_sample_rate, + "duration": duration, "save_path": save_path, "audio": wav_base64 } diff --git a/paddlespeech/server/tests/asr/online/microphone_client.py b/paddlespeech/server/tests/asr/online/microphone_client.py new file mode 100644 index 000000000..2ceaf6d03 --- /dev/null +++ b/paddlespeech/server/tests/asr/online/microphone_client.py @@ -0,0 +1,161 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +record wave from the mic +""" +import asyncio +import json +import logging +import threading +import wave +from signal import SIGINT +from signal import SIGTERM + +import pyaudio +import websockets + + +class ASRAudioHandler(threading.Thread): + def __init__(self, url="127.0.0.1", port=8091): + threading.Thread.__init__(self) + self.url = url + self.port = port + self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr" + self.fileName = "./output.wav" + self.chunk = 5120 + self.format = pyaudio.paInt16 + self.channels = 1 + self.rate = 16000 + self._running = True + self._frames = [] + self.data_backup = [] + + def startrecord(self): + """ + start a new thread to record wave + """ + threading._start_new_thread(self.recording, ()) + + def recording(self): + """ + recording wave + """ + self._running = True + self._frames = [] + p = pyaudio.PyAudio() + stream = p.open( + format=self.format, + channels=self.channels, + rate=self.rate, + input=True, + frames_per_buffer=self.chunk) + while (self._running): + data = stream.read(self.chunk) + self._frames.append(data) + self.data_backup.append(data) + + stream.stop_stream() + stream.close() + p.terminate() + + def save(self): + """ + save wave data + """ + p = pyaudio.PyAudio() + wf = wave.open(self.fileName, 'wb') + wf.setnchannels(self.channels) + wf.setsampwidth(p.get_sample_size(self.format)) + wf.setframerate(self.rate) + wf.writeframes(b''.join(self.data_backup)) + wf.close() + p.terminate() + + def stoprecord(self): + """ + stop recording + """ + self._running = False + + async def run(self): + aa = input("是否开始录音? 
(y/n)") + if aa.strip() == "y": + self.startrecord() + logging.info("*" * 10 + "开始录音,请输入语音") + + async with websockets.connect(self.url) as ws: + # 发送开始指令 + audio_info = json.dumps( + { + "name": "test.wav", + "signal": "start", + "nbest": 5 + }, + sort_keys=True, + indent=4, + separators=(',', ': ')) + await ws.send(audio_info) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + # send bytes data + logging.info("结束录音请: Ctrl + c。继续请按回车。") + try: + while True: + while len(self._frames) > 0: + await ws.send(self._frames.pop(0)) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + except asyncio.CancelledError: + # quit + # send finished + audio_info = json.dumps( + { + "name": "test.wav", + "signal": "end", + "nbest": 5 + }, + sort_keys=True, + indent=4, + separators=(',', ': ')) + await ws.send(audio_info) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + self.stoprecord() + logging.info("*" * 10 + "录音结束") + self.save() + elif aa.strip() == "n": + exit() + else: + print("无效输入!") + exit() + + +if __name__ == "__main__": + + logging.basicConfig(level=logging.INFO) + logging.info("asr websocket client start") + + handler = ASRAudioHandler("127.0.0.1", 8091) + loop = asyncio.get_event_loop() + main_task = asyncio.ensure_future(handler.run()) + for signal in [SIGINT, SIGTERM]: + loop.add_signal_handler(signal, main_task.cancel) + try: + loop.run_until_complete(main_task) + finally: + loop.close() + + logging.info("asr websocket client finished") diff --git a/paddlespeech/server/tests/asr/online/websocket_client.py b/paddlespeech/server/tests/asr/online/websocket_client.py new file mode 100644 index 000000000..58b1a452c --- /dev/null +++ b/paddlespeech/server/tests/asr/online/websocket_client.py @@ -0,0 +1,115 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
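The `read_wave` generator in the client below splits the waveform into 80 ms windows with a 40 ms shift and pads the tail so the last window is full. A worked example of that arithmetic (the 3-second input length is an assumption):

```python
# Chunking arithmetic mirrored from read_wave below (assumption: a 3 s, 16 kHz file).
sample_rate = 16000
chunk_size = 80 * 16      # 80 ms window  -> 1280 samples
chunk_stride = 40 * 16    # 40 ms shift   ->  640 samples

x_len = 3 * sample_rate   # 48000 samples
remainder = (x_len - chunk_size) % chunk_stride
padding_len_x = 0 if remainder == 0 else chunk_stride - remainder
num_chunk = (x_len + padding_len_x - chunk_size) // chunk_stride + 1
print(padding_len_x, num_chunk)   # 0 74 -> 74 chunks of 1280 samples, no padding needed
```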
+#!/usr/bin/python +# -*- coding: UTF-8 -*- +import argparse +import asyncio +import json +import logging + +import numpy as np +import soundfile +import websockets + + +class ASRAudioHandler: + def __init__(self, url="127.0.0.1", port=8090): + self.url = url + self.port = port + self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr" + + def read_wave(self, wavfile_path: str): + samples, sample_rate = soundfile.read(wavfile_path, dtype='int16') + x_len = len(samples) + chunk_stride = 40 * 16 #40ms, sample_rate = 16kHz + chunk_size = 80 * 16 #80ms, sample_rate = 16kHz + + if (x_len - chunk_size) % chunk_stride != 0: + padding_len_x = chunk_stride - (x_len - chunk_size) % chunk_stride + else: + padding_len_x = 0 + + padding = np.zeros((padding_len_x), dtype=samples.dtype) + padded_x = np.concatenate([samples, padding], axis=0) + + num_chunk = (x_len + padding_len_x - chunk_size) / chunk_stride + 1 + num_chunk = int(num_chunk) + + for i in range(0, num_chunk): + start = i * chunk_stride + end = start + chunk_size + x_chunk = padded_x[start:end] + yield x_chunk + + async def run(self, wavfile_path: str): + logging.info("send a message to the server") + # 读取音频 + # self.read_wave() + # 发送 websocket 的 handshake 协议头 + async with websockets.connect(self.url) as ws: + # server 端已经接收到 handshake 协议头 + # 发送开始指令 + audio_info = json.dumps( + { + "name": "test.wav", + "signal": "start", + "nbest": 5 + }, + sort_keys=True, + indent=4, + separators=(',', ': ')) + await ws.send(audio_info) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + # send chunk audio data to engine + for chunk_data in self.read_wave(wavfile_path): + await ws.send(chunk_data.tobytes()) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + # finished + audio_info = json.dumps( + { + "name": "test.wav", + "signal": "end", + "nbest": 5 + }, + sort_keys=True, + indent=4, + separators=(',', ': ')) + await ws.send(audio_info) + msg = await ws.recv() + logging.info("receive msg={}".format(msg)) + + +def main(args): + logging.basicConfig(level=logging.INFO) + logging.info("asr websocket client start") + handler = ASRAudioHandler("127.0.0.1", 8091) + loop = asyncio.get_event_loop() + loop.run_until_complete(handler.run(args.wavfile)) + logging.info("asr websocket client finished") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--wavfile", + action="store", + help="wav file path ", + default="./16_audio.wav") + args = parser.parse_args() + + main(args) diff --git a/paddlespeech/server/utils/buffer.py b/paddlespeech/server/utils/buffer.py new file mode 100644 index 000000000..682357b34 --- /dev/null +++ b/paddlespeech/server/utils/buffer.py @@ -0,0 +1,59 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
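`ChunkBuffer` below carves the byte stream received over the websocket into fixed frames. With the `chunk_buffer_conf` values from `ws_application.yaml` (80 ms frames, 40 ms shift, 16 kHz, 2-byte samples), the sizes work out as follows; this is a standalone check of the same formulas used in the class:

```python
# Frame/shift sizes implied by chunk_buffer_conf, using the same formulas as ChunkBuffer.
sample_rate = 16000
sample_width = 2            # int16 PCM
frame_duration_ms = 80
shift_ms = 40

n = int(sample_rate * (frame_duration_ms / 1000.0) * sample_width)   # bytes per frame
shift_n = int(sample_rate * (shift_ms / 1000.0) * sample_width)      # bytes per shift
print(n, shift_n)                                   # 2560 1280
print((float(n) / sample_rate) / sample_width)      # 0.08 -> each frame covers 80 ms
```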
+ + +class Frame(object): + """Represents a "frame" of audio data.""" + + def __init__(self, bytes, timestamp, duration): + self.bytes = bytes + self.timestamp = timestamp + self.duration = duration + + +class ChunkBuffer(object): + def __init__(self, + frame_duration_ms=80, + shift_ms=40, + sample_rate=16000, + sample_width=2): + self.sample_rate = sample_rate + self.frame_duration_ms = frame_duration_ms + self.shift_ms = shift_ms + self.remained_audio = b'' + self.sample_width = sample_width # int16 = 2; float32 = 4 + + def frame_generator(self, audio): + """Generates audio frames from PCM audio data. + Takes the desired frame duration in milliseconds, the PCM data, and + the sample rate. + Yields Frames of the requested duration. + """ + audio = self.remained_audio + audio + self.remained_audio = b'' + + n = int(self.sample_rate * (self.frame_duration_ms / 1000.0) * + self.sample_width) + shift_n = int(self.sample_rate * (self.shift_ms / 1000.0) * + self.sample_width) + offset = 0 + timestamp = 0.0 + duration = (float(n) / self.sample_rate) / self.sample_width + shift_duration = (float(shift_n) / self.sample_rate) / self.sample_width + while offset + n <= len(audio): + yield Frame(audio[offset:offset + n], timestamp, duration) + timestamp += shift_duration + offset += shift_n + + self.remained_audio += audio[offset:] diff --git a/paddlespeech/server/utils/paddle_predictor.py b/paddlespeech/server/utils/paddle_predictor.py index 4035d48d8..16653cf37 100644 --- a/paddlespeech/server/utils/paddle_predictor.py +++ b/paddlespeech/server/utils/paddle_predictor.py @@ -35,10 +35,12 @@ def init_predictor(model_dir: Optional[os.PathLike]=None, Returns: predictor (PaddleInferPredictor): created predictor """ - if model_dir is not None: + assert os.path.isdir(model_dir), 'Please check model dir.' config = Config(args.model_dir) else: + assert os.path.isfile(model_file) and os.path.isfile( + params_file), 'Please check model and parameter files.' config = Config(model_file, params_file) # set device @@ -66,7 +68,6 @@ def init_predictor(model_dir: Optional[os.PathLike]=None, config.enable_memory_optim() predictor = create_predictor(config) - return predictor @@ -84,10 +85,8 @@ def run_model(predictor, input: List) -> List: for i, name in enumerate(input_names): input_handle = predictor.get_input_handle(name) input_handle.copy_from_cpu(input[i]) - # do the inference predictor.run() - results = [] # get out data from output tensor output_names = predictor.get_output_names() diff --git a/paddlespeech/server/utils/vad.py b/paddlespeech/server/utils/vad.py new file mode 100644 index 000000000..a2dcf68b8 --- /dev/null +++ b/paddlespeech/server/utils/vad.py @@ -0,0 +1,78 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
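`VADAudio` below wraps webrtcvad: raw bytes are appended with `add_audio`, and `vad_collector` yields voiced frames separated by `None` markers. The websocket handler added later in this patch keeps only the non-`None` frames before feeding the engine. A minimal sketch of that pattern (`pcm_bytes` is an assumption standing in for one received binary message; the parameters mirror `vad_conf`):

```python
from paddlespeech.server.utils.vad import VADAudio

# Parameters mirror the vad_conf block of ws_application.yaml.
vad = VADAudio(aggressiveness=2, rate=16000, frame_duration_ms=20)

vad.add_audio(pcm_bytes)    # assumption: pcm_bytes is 16 kHz int16 PCM as raw bytes
voiced = b''.join(frame for frame in vad.vad_collector() if frame is not None)
print(len(voiced), "bytes of voiced audio kept")
```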
+import collections + +import webrtcvad + + +class VADAudio(): + def __init__(self, + aggressiveness=2, + rate=16000, + frame_duration_ms=20, + sample_width=2, + padding_ms=200, + padding_ratio=0.9): + """Initializes VAD with given aggressivenes and sets up internal queues""" + self.vad = webrtcvad.Vad(aggressiveness) + self.rate = rate + self.sample_width = sample_width + self.frame_duration_ms = frame_duration_ms + self._frame_length = int(rate * (frame_duration_ms / 1000.0) * + self.sample_width) + self._buffer_queue = collections.deque() + self.ring_buffer = collections.deque(maxlen=padding_ms // + frame_duration_ms) + self._ratio = padding_ratio + self.triggered = False + + def add_audio(self, audio): + """Adds new audio to internal queue""" + for x in audio: + self._buffer_queue.append(x) + + def frame_generator(self): + """Generator that yields audio frames of frame_duration_ms""" + while len(self._buffer_queue) > self._frame_length: + frame = bytearray() + for _ in range(self._frame_length): + frame.append(self._buffer_queue.popleft()) + yield bytes(frame) + + def vad_collector(self): + """Generator that yields series of consecutive audio frames comprising each utterence, separated by yielding a single None. + Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered. + Example: (frame, ..., frame, None, frame, ..., frame, None, ...) + |---utterence---| |---utterence---| + """ + for frame in self.frame_generator(): + is_speech = self.vad.is_speech(frame, self.rate) + if not self.triggered: + self.ring_buffer.append((frame, is_speech)) + num_voiced = len( + [f for f, speech in self.ring_buffer if speech]) + if num_voiced > self._ratio * self.ring_buffer.maxlen: + self.triggered = True + for f, s in self.ring_buffer: + yield f + self.ring_buffer.clear() + else: + yield frame + self.ring_buffer.append((frame, is_speech)) + num_unvoiced = len( + [f for f, speech in self.ring_buffer if not speech]) + if num_unvoiced > self._ratio * self.ring_buffer.maxlen: + self.triggered = False + yield None + self.ring_buffer.clear() diff --git a/paddlespeech/server/ws/__init__.py b/paddlespeech/server/ws/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddlespeech/server/ws/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/server/ws/api.py b/paddlespeech/server/ws/api.py new file mode 100644 index 000000000..10664d114 --- /dev/null +++ b/paddlespeech/server/ws/api.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List + +from fastapi import APIRouter + +from paddlespeech.server.ws.asr_socket import router as asr_router + +_router = APIRouter() + + +def setup_router(api_list: List): + """setup router for fastapi + Args: + api_list (List): [asr, tts] + Returns: + APIRouter + """ + for api_name in api_list: + if api_name == 'asr': + _router.include_router(asr_router) + elif api_name == 'tts': + pass + else: + pass + + return _router diff --git a/paddlespeech/server/ws/asr_socket.py b/paddlespeech/server/ws/asr_socket.py new file mode 100644 index 000000000..ea19816b6 --- /dev/null +++ b/paddlespeech/server/ws/asr_socket.py @@ -0,0 +1,100 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json + +import numpy as np +from fastapi import APIRouter +from fastapi import WebSocket +from fastapi import WebSocketDisconnect +from starlette.websockets import WebSocketState as WebSocketState + +from paddlespeech.server.engine.engine_pool import get_engine_pool +from paddlespeech.server.utils.buffer import ChunkBuffer +from paddlespeech.server.utils.vad import VADAudio + +router = APIRouter() + + +@router.websocket('/ws/asr') +async def websocket_endpoint(websocket: WebSocket): + + await websocket.accept() + + engine_pool = get_engine_pool() + asr_engine = engine_pool['asr'] + # init buffer + chunk_buffer_conf = asr_engine.config.chunk_buffer_conf + chunk_buffer = ChunkBuffer( + sample_rate=chunk_buffer_conf['sample_rate'], + sample_width=chunk_buffer_conf['sample_width']) + # init vad + vad_conf = asr_engine.config.vad_conf + vad = VADAudio( + aggressiveness=vad_conf['aggressiveness'], + rate=vad_conf['sample_rate'], + frame_duration_ms=vad_conf['frame_duration_ms']) + + try: + while True: + # careful here, changed the source code from starlette.websockets + assert websocket.application_state == WebSocketState.CONNECTED + message = await websocket.receive() + websocket._raise_on_disconnect(message) + if "text" in message: + message = json.loads(message["text"]) + if 'signal' not in message: + resp = {"status": "ok", "message": "no valid json data"} + await websocket.send_json(resp) + + if message['signal'] == 'start': + resp = {"status": "ok", "signal": "server_ready"} + # do something at begining here + await websocket.send_json(resp) + elif message['signal'] == 'end': + engine_pool = get_engine_pool() + asr_engine = engine_pool['asr'] + # reset single engine for an new connection + asr_engine.reset() + resp = {"status": "ok", "signal": "finished"} + await websocket.send_json(resp) + break + else: + resp = 
{"status": "ok", "message": "no valid json data"} + await websocket.send_json(resp) + elif "bytes" in message: + message = message["bytes"] + + # vad for input bytes audio + vad.add_audio(message) + message = b''.join(f for f in vad.vad_collector() + if f is not None) + + engine_pool = get_engine_pool() + asr_engine = engine_pool['asr'] + asr_results = "" + frames = chunk_buffer.frame_generator(message) + for frame in frames: + samples = np.frombuffer(frame.bytes, dtype=np.int16) + sample_rate = asr_engine.config.sample_rate + x_chunk, x_chunk_lens = asr_engine.preprocess(samples, + sample_rate) + asr_engine.run(x_chunk, x_chunk_lens) + asr_results = asr_engine.postprocess() + + asr_results = asr_engine.postprocess() + resp = {'asr_results': asr_results} + + await websocket.send_json(resp) + except WebSocketDisconnect: + pass diff --git a/paddlespeech/t2s/exps/csmsc_test.txt b/paddlespeech/t2s/exps/csmsc_test.txt new file mode 100644 index 000000000..d8cf367cd --- /dev/null +++ b/paddlespeech/t2s/exps/csmsc_test.txt @@ -0,0 +1,100 @@ +009901 昨日,这名伤者与医生全部被警方依法刑事拘留。 +009902 钱伟长想到上海来办学校是经过深思熟虑的。 +009903 她见我一进门就骂,吃饭时也骂,骂得我抬不起头。 +009904 李述德在离开之前,只说了一句柱驼杀父亲了。 +009905 这种车票和保险单捆绑出售属于重复性购买。 +009906 戴佩妮的男友西米露接唱情歌,让她非常开心。 +009907 观大势,谋大局,出大策始终是该院的办院方针。 +009908 他们骑着摩托回家,正好为农忙时的父母帮忙。 +009909 但是因为还没到退休年龄,只能掰着指头捱日子。 +009910 这几天雨水不断,人们恨不得待在家里不出门。 +009911 没想到徐赟,张海翔两人就此玩起了人间蒸发。 +009912 藤村此番发言可能是为了凸显野田的领导能力。 +009913 程长庚,生在清王朝嘉庆年间,安徽的潜山小县。 +009914 南海海域综合补给基地码头项目正在论证中。 +009915 也就是说今晚成都市民极有可能再次看到飘雪。 +009916 随着天气转热,各地的游泳场所开始人头攒动。 +009917 更让徐先生纳闷的是,房客的手机也打不通了。 +009918 遇到颠簸时,应听从乘务员的安全指令,回座位坐好。 +009919 他在后面呆惯了,怕自己一插身后的人会不满,不敢排进去。 +009920 傍晚七个小人回来了,白雪公主说,你们就是我命中的七个小矮人吧。 +009921 他本想说,教育局管这个,他们是一路的,这样一管岂不是妓女起嫖客? +009922 一种表示商品所有权的财物证券,也称商品证券,如提货单,交货单。 +009923 会有很丰富的东西留下来,说都说不完。 +009924 这句话像从天而降,吓得四周一片寂静。 +009925 记者所在的是受害人家属所在的右区。 +009926 不管哈大爷去哪,它都一步不离地跟着。 +009927 大家抬头望去,一只老鼠正趴在吊顶上。 +009928 我决定过年就辞职,接手我爸的废品站! +009929 最终,中国男子乒乓球队获得此奖项。 +009930 防汛抗旱两手抓,抗旱相对抓的不够。 +009931 图们江下游地区开发开放的进展如何? +009932 这要求中国必须有一个坚强的政党领导。 +009933 再说,关于利益上的事俺俩都不好开口。 +009934 明代瓦剌,鞑靼入侵明境也是通过此地。 +009935 咪咪舔着孩子,把它身上的毛舔干净。 +009936 是否这次的国标修订被大企业绑架了? +009937 判决后,姚某妻子胡某不服,提起上诉。 +009938 由此可以看出邯钢的经济效益来自何处。 +009939 琳达说,是瑜伽改变了她和马儿的生活。 +009940 楼下的保安告诉记者,这里不租也不卖。 +009941 习近平说,中斯两国人民传统友谊深厚。 +009942 传闻越来越多,后来连老汉儿自己都怕了。 +009943 我怒吼一声冲上去,举起砖头砸了过去。 +009944 我现在还不会,这就回去问问发明我的人。 +009945 显然,洛阳性奴案不具备上述两个前提。 +009946 另外,杰克逊有文唇线,眼线,眉毛的动作。 +009947 昨晚,华西都市报记者电话采访了尹琪。 +009948 涅拉季科未透露这些航空公司的名称。 +009949 从运行轨迹上来说,它也不可能是星星。 +009950 目前看,如果继续加息也存在两难问题。 +009951 曾宝仪在节目录制现场大爆观众糗事。 +009952 但任凭周某怎么叫,男子仍酣睡不醒。 +009953 老大爷说,小子,你挡我财路了,知道不? +009954 没料到,闯下大头佛的阿伟还不知悔改。 +009955 卡扎菲部落式统治已遭遇部落内讧。 +009956 这个孩子的生命一半来源于另一位女士捐赠的冷冻卵子。 +009957 出现这种泥鳅内阁的局面既是野田有意为之,也实属无奈。 +009958 济青高速济南,华山,章丘,邹平,周村,淄博,临淄站。 +009959 赵凌飞的话,反映了沈阳赛区所有奥运志愿者的共同心声。 +009960 因为,我们所发出的力量必会因难度加大而减弱。 +009961 发生事故的楼梯拐角处仍可看到血迹。 +009962 想过进公安,可能身高不够,老汉儿也不让我进去。 +009963 路上关卡很多,为了方便撤离,只好轻装前进。 +009964 原来比尔盖茨就是美国微软公司联合创始人呀。 +009965 之后他们一家三口将与双方父母往峇里岛旅游。 +009966 谢谢总理,也感谢广大网友的参与,我们明年再见。 +009967 事实上是,从来没有一个欺善怕恶的人能作出过稍大一点的成就。 +009968 我会打开邮件,你可以从那里继续。 +009969 美方对近期东海局势表示关切。 +009970 据悉,奥巴马一家人对这座冬季白宫极为满意。 +009971 打扫完你会很有成就感的,试一试,你就信了。 +009972 诺曼站在滑板车上,各就各位,准备出发啦! +009973 塔河的寒夜,气温降到了零下三十多摄氏度。 +009974 其间,连破六点六,六点五,六点四,六点三五等多个重要关口。 +009975 算命其实只是人们的一种自我安慰和自我暗示而已,我们还是要相信科学才好。 +009976 这一切都令人欢欣鼓舞,阿讷西没理由不坚持到最后。 +009977 直至公元前一万一千年,它又再次出现。 +009978 尽量少玩电脑,少看电视,少打游戏。 +009979 从五到七,前后也就是六个月的时间。 +009980 一进咖啡店,他就遇见一张熟悉的脸。 +009981 好在众弟兄看到了把她追了回来。 +009982 有一个人说,哥们儿我们跑过它才能活。 +009983 捅了她以后,模糊记得她没咋动了。 +009984 从小到大,葛启义没有收到过压岁钱。 +009985 舞台下的你会对舞台上的你说什么? 
+009986 但考生普遍认为,试题的怪多过难。 +009987 我希望每个人都能够尊重我们的隐私。 +009988 漫天的红霞使劲给两人增添气氛。 +009989 晚上加完班开车回家,太累了,迷迷糊糊开着车,走一半的时候,铛一声! +009990 该车将三人撞倒后,在大雾中逃窜。 +009991 这人一哆嗦,方向盘也把不稳了,差点撞上了高速边道护栏。 +009992 那女孩儿委屈的说,我一回头见你已经进去了我不敢进去啊! +009993 小明摇摇头说,不是,我只是美女看多了,想换个口味而已。 +009994 接下来,红娘要求记者交费,记者表示不知表姐身份证号码。 +009995 李东蓊表示,自己当时在法庭上发表了一次独特的公诉意见。 +009996 另一男子扑了上来,手里拿着明晃晃的长刀,向他胸口直刺。 +009997 今天,快递员拿着一个快递在办公室喊,秦王是哪个,有他快递? +009998 这场抗议活动究竟是如何发展演变的,又究竟是谁伤害了谁? +009999 因华国锋肖鸡,墓地设计根据其属相设计。 +010000 在狱中,张明宝悔恨交加,写了一份忏悔书。 diff --git a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py index 3fded29b1..4c92ad1cc 100644 --- a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py +++ b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py @@ -13,7 +13,6 @@ # limitations under the License. # generate mels using durations.txt # for mb melgan finetune -# 长度和原本的 mel 不一致怎么办? import argparse import os from pathlib import Path diff --git a/paddlespeech/t2s/exps/gan_vocoder/synthesize.py b/paddlespeech/t2s/exps/gan_vocoder/synthesize.py index c60b9add2..9d9a8c49b 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/synthesize.py +++ b/paddlespeech/t2s/exps/gan_vocoder/synthesize.py @@ -34,7 +34,7 @@ def main(): "--generator-type", type=str, default="pwgan", - help="type of GANVocoder, should in {pwgan, mb_melgan, style_melgan, } now" + help="type of GANVocoder, should in {pwgan, mb_melgan, style_melgan, hifigan, } now" ) parser.add_argument("--config", type=str, help="GANVocoder config file.") parser.add_argument("--checkpoint", type=str, help="snapshot to load.") diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index 26d7e2c08..1188ddfb1 100644 --- a/paddlespeech/t2s/exps/inference.py +++ b/paddlespeech/t2s/exps/inference.py @@ -17,13 +17,92 @@ from pathlib import Path import numpy import soundfile as sf from paddle import inference - -from paddlespeech.t2s.frontend import English -from paddlespeech.t2s.frontend.zh_frontend import Frontend +from timer import timer + +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.utils import str2bool + + +def get_predictor(args, filed='am'): + full_name = '' + if filed == 'am': + full_name = args.am + elif filed == 'voc': + full_name = args.voc + model_name = full_name[:full_name.rindex('_')] + config = inference.Config( + str(Path(args.inference_dir) / (full_name + ".pdmodel")), + str(Path(args.inference_dir) / (full_name + ".pdiparams"))) + if args.device == "gpu": + config.enable_use_gpu(100, 0) + elif args.device == "cpu": + config.disable_gpu() + # This line must be commented for fastspeech2, if not, it will OOM + if model_name != 'fastspeech2': + config.enable_memory_optim() + predictor = inference.create_predictor(config) + return predictor -# only inference for models trained with csmsc now -def main(): +def get_am_output(args, am_predictor, frontend, merge_sentences, input): + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + am_input_names = am_predictor.get_input_names() + get_tone_ids = False + get_spk_id = False + if am_name == 'speedyspeech': + get_tone_ids = True + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + get_spk_id = True + spk_id = numpy.array([args.spk_id]) + if args.lang == 'zh': + input_ids = frontend.get_input_ids( + input, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids) + phone_ids = input_ids["phone_ids"] + elif args.lang == 'en': + input_ids = 
frontend.get_input_ids( + input, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] + else: + print("lang should in {'zh', 'en'}!") + + if get_tone_ids: + tone_ids = input_ids["tone_ids"] + tones = tone_ids[0].numpy() + tones_handle = am_predictor.get_input_handle(am_input_names[1]) + tones_handle.reshape(tones.shape) + tones_handle.copy_from_cpu(tones) + if get_spk_id: + spk_id_handle = am_predictor.get_input_handle(am_input_names[1]) + spk_id_handle.reshape(spk_id.shape) + spk_id_handle.copy_from_cpu(spk_id) + phones = phone_ids[0].numpy() + phones_handle = am_predictor.get_input_handle(am_input_names[0]) + phones_handle.reshape(phones.shape) + phones_handle.copy_from_cpu(phones) + + am_predictor.run() + am_output_names = am_predictor.get_output_names() + am_output_handle = am_predictor.get_output_handle(am_output_names[0]) + am_output_data = am_output_handle.copy_to_cpu() + return am_output_data + + +def get_voc_output(args, voc_predictor, input): + voc_input_names = voc_predictor.get_input_names() + mel_handle = voc_predictor.get_input_handle(voc_input_names[0]) + mel_handle.reshape(input.shape) + mel_handle.copy_from_cpu(input) + + voc_predictor.run() + voc_output_names = voc_predictor.get_output_names() + voc_output_handle = voc_predictor.get_output_handle(voc_output_names[0]) + wav = voc_output_handle.copy_to_cpu() + return wav + + +def parse_args(): parser = argparse.ArgumentParser( description="Paddle Infernce with speedyspeech & parallel wavegan.") # acoustic model @@ -70,113 +149,97 @@ def main(): parser.add_argument( "--inference_dir", type=str, help="dir to save inference models") parser.add_argument("--output_dir", type=str, help="output dir") + # inference + parser.add_argument( + "--use_trt", + type=str2bool, + default=False, + help="Whether to use inference engin TensorRT.", ) + parser.add_argument( + "--int8", + type=str2bool, + default=False, + help="Whether to use int8 inference.", ) + parser.add_argument( + "--fp16", + type=str2bool, + default=False, + help="Whether to use float16 inference.", ) + parser.add_argument( + "--device", + default="gpu", + choices=["gpu", "cpu"], + help="Device selected for inference.", ) args, _ = parser.parse_known_args() + return args + +# only inference for models trained with csmsc now +def main(): + args = parse_args() # frontend - if args.lang == 'zh': - frontend = Frontend( - phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) - elif args.lang == 'en': - frontend = English(phone_vocab_path=args.phones_dict) - print("frontend done!") + frontend = get_frontend(args) + # am_predictor + am_predictor = get_predictor(args, filed='am') # model: {model_name}_{dataset} - am_name = args.am[:args.am.rindex('_')] am_dataset = args.am[args.am.rindex('_') + 1:] - am_config = inference.Config( - str(Path(args.inference_dir) / (args.am + ".pdmodel")), - str(Path(args.inference_dir) / (args.am + ".pdiparams"))) - am_config.enable_use_gpu(100, 0) - # This line must be commented for fastspeech2, if not, it will OOM - if am_name != 'fastspeech2': - am_config.enable_memory_optim() - am_predictor = inference.create_predictor(am_config) - - voc_config = inference.Config( - str(Path(args.inference_dir) / (args.voc + ".pdmodel")), - str(Path(args.inference_dir) / (args.voc + ".pdiparams"))) - voc_config.enable_use_gpu(100, 0) - voc_config.enable_memory_optim() - voc_predictor = inference.create_predictor(voc_config) + # voc_predictor + voc_predictor = get_predictor(args, filed='voc') output_dir = Path(args.output_dir) 
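The rewritten main() below times a warm-up pass and then every utterance, printing a generation speed in Hz and a real-time factor (RTF). A small worked example of that arithmetic, with purely illustrative numbers (24000 vs. 22050 mirrors the fs selection used in the timing loops):

```python
# Illustration only: how the printed speed / RTF values are derived.
fs = 24000          # output sample rate; 22050 is used for *_ljspeech models
elapse = 0.5        # wall-clock seconds spent on synthesis (example value)
wav_size = 48000    # samples produced, i.e. 2.0 s of audio

speed = wav_size / elapse   # samples generated per second of compute: 96000 Hz
rtf = fs / speed            # 0.25 == elapse / (wav_size / fs)
assert abs(rtf - elapse / (wav_size / fs)) < 1e-12
# RTF < 1 means synthesis runs faster than real time.
```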
output_dir.mkdir(parents=True, exist_ok=True) - sentences = [] - - print("in new inference") - - # construct dataset for evaluation - sentences = [] - with open(args.text, 'rt') as f: - for line in f: - items = line.strip().split() - utt_id = items[0] - if args.lang == 'zh': - sentence = "".join(items[1:]) - elif args.lang == 'en': - sentence = " ".join(items[1:]) - sentences.append((utt_id, sentence)) - get_tone_ids = False - get_spk_id = False - if am_name == 'speedyspeech': - get_tone_ids = True - if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: - get_spk_id = True - spk_id = numpy.array([args.spk_id]) + sentences = get_sentences(args) - am_input_names = am_predictor.get_input_names() - print("am_input_names:", am_input_names) merge_sentences = True + fs = 24000 if am_dataset != 'ljspeech' else 22050 + # warmup + for utt_id, sentence in sentences[:3]: + with timer() as t: + am_output_data = get_am_output( + args, + am_predictor=am_predictor, + frontend=frontend, + merge_sentences=merge_sentences, + input=sentence) + wav = get_voc_output( + args, voc_predictor=voc_predictor, input=am_output_data) + speed = wav.size / t.elapse + rtf = fs / speed + print( + f"{utt_id}, mel: {am_output_data.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." + ) + + print("warm up done!") + + N = 0 + T = 0 for utt_id, sentence in sentences: - if args.lang == 'zh': - input_ids = frontend.get_input_ids( - sentence, + with timer() as t: + am_output_data = get_am_output( + args, + am_predictor=am_predictor, + frontend=frontend, merge_sentences=merge_sentences, - get_tone_ids=get_tone_ids) - phone_ids = input_ids["phone_ids"] - elif args.lang == 'en': - input_ids = frontend.get_input_ids( - sentence, merge_sentences=merge_sentences) - phone_ids = input_ids["phone_ids"] - else: - print("lang should in {'zh', 'en'}!") - - if get_tone_ids: - tone_ids = input_ids["tone_ids"] - tones = tone_ids[0].numpy() - tones_handle = am_predictor.get_input_handle(am_input_names[1]) - tones_handle.reshape(tones.shape) - tones_handle.copy_from_cpu(tones) - if get_spk_id: - spk_id_handle = am_predictor.get_input_handle(am_input_names[1]) - spk_id_handle.reshape(spk_id.shape) - spk_id_handle.copy_from_cpu(spk_id) - phones = phone_ids[0].numpy() - phones_handle = am_predictor.get_input_handle(am_input_names[0]) - phones_handle.reshape(phones.shape) - phones_handle.copy_from_cpu(phones) - - am_predictor.run() - am_output_names = am_predictor.get_output_names() - am_output_handle = am_predictor.get_output_handle(am_output_names[0]) - am_output_data = am_output_handle.copy_to_cpu() - - voc_input_names = voc_predictor.get_input_names() - mel_handle = voc_predictor.get_input_handle(voc_input_names[0]) - mel_handle.reshape(am_output_data.shape) - mel_handle.copy_from_cpu(am_output_data) - - voc_predictor.run() - voc_output_names = voc_predictor.get_output_names() - voc_output_handle = voc_predictor.get_output_handle(voc_output_names[0]) - wav = voc_output_handle.copy_to_cpu() + input=sentence) + wav = get_voc_output( + args, voc_predictor=voc_predictor, input=am_output_data) + + N += wav.size + T += t.elapse + speed = wav.size / t.elapse + rtf = fs / speed sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000) + print( + f"{utt_id}, mel: {am_output_data.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." 
+ ) print(f"{utt_id} done!") + print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }") if __name__ == "__main__": diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py new file mode 100644 index 000000000..c52cb3727 --- /dev/null +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -0,0 +1,243 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import os + +import numpy as np +import paddle +from paddle import jit +from paddle.static import InputSpec + +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.t2s.datasets.data_table import DataTable +from paddlespeech.t2s.frontend import English +from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.modules.normalizer import ZScore + +model_alias = { + # acoustic model + "speedyspeech": + "paddlespeech.t2s.models.speedyspeech:SpeedySpeech", + "speedyspeech_inference": + "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference", + "fastspeech2": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2", + "fastspeech2_inference": + "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", + "tacotron2": + "paddlespeech.t2s.models.tacotron2:Tacotron2", + "tacotron2_inference": + "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", + # voc + "pwgan": + "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", + "pwgan_inference": + "paddlespeech.t2s.models.parallel_wavegan:PWGInference", + "mb_melgan": + "paddlespeech.t2s.models.melgan:MelGANGenerator", + "mb_melgan_inference": + "paddlespeech.t2s.models.melgan:MelGANInference", + "style_melgan": + "paddlespeech.t2s.models.melgan:StyleMelGANGenerator", + "style_melgan_inference": + "paddlespeech.t2s.models.melgan:StyleMelGANInference", + "hifigan": + "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", + "hifigan_inference": + "paddlespeech.t2s.models.hifigan:HiFiGANInference", + "wavernn": + "paddlespeech.t2s.models.wavernn:WaveRNN", + "wavernn_inference": + "paddlespeech.t2s.models.wavernn:WaveRNNInference", +} + + +# input +def get_sentences(args): + # construct dataset for evaluation + sentences = [] + with open(args.text, 'rt') as f: + for line in f: + items = line.strip().split() + utt_id = items[0] + if 'lang' in args and args.lang == 'zh': + sentence = "".join(items[1:]) + elif 'lang' in args and args.lang == 'en': + sentence = " ".join(items[1:]) + sentences.append((utt_id, sentence)) + return sentences + + +def get_test_dataset(args, test_metadata, am_name, am_dataset): + if am_name == 'fastspeech2': + fields = ["utt_id", "text"] + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + print("multiple speaker fastspeech2!") + fields += ["spk_id"] + elif 'voice_cloning' in args and args.voice_cloning: + print("voice cloning!") + fields += ["spk_emb"] + else: + print("single speaker fastspeech2!") + elif am_name == 'speedyspeech': + fields = ["utt_id", "phones", "tones"] + elif am_name == 'tacotron2': + fields = ["utt_id", "text"] + if 
'voice_cloning' in args and args.voice_cloning: + print("voice cloning!") + fields += ["spk_emb"] + + test_dataset = DataTable(data=test_metadata, fields=fields) + return test_dataset + + +# frontend +def get_frontend(args): + if 'lang' in args and args.lang == 'zh': + frontend = Frontend( + phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) + elif 'lang' in args and args.lang == 'en': + frontend = English(phone_vocab_path=args.phones_dict) + else: + print("wrong lang!") + print("frontend done!") + return frontend + + +# dygraph +def get_am_inference(args, am_config): + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + tone_size = None + if 'tones_dict' in args and args.tones_dict: + with open(args.tones_dict, "r") as f: + tone_id = [line.strip().split() for line in f.readlines()] + tone_size = len(tone_id) + print("tone_size:", tone_size) + + spk_num = None + if 'speaker_dict' in args and args.speaker_dict: + with open(args.speaker_dict, 'rt') as f: + spk_id = [line.strip().split() for line in f.readlines()] + spk_num = len(spk_id) + print("spk_num:", spk_num) + + odim = am_config.n_mels + # model: {model_name}_{dataset} + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + + am_class = dynamic_import(am_name, model_alias) + am_inference_class = dynamic_import(am_name + '_inference', model_alias) + + if am_name == 'fastspeech2': + am = am_class( + idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"]) + elif am_name == 'speedyspeech': + am = am_class( + vocab_size=vocab_size, + tone_size=tone_size, + spk_num=spk_num, + **am_config["model"]) + elif am_name == 'tacotron2': + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) + + am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) + am.eval() + am_mu, am_std = np.load(args.am_stat) + am_mu = paddle.to_tensor(am_mu) + am_std = paddle.to_tensor(am_std) + am_normalizer = ZScore(am_mu, am_std) + am_inference = am_inference_class(am_normalizer, am) + am_inference.eval() + print("acoustic model done!") + return am_inference, am_name, am_dataset + + +def get_voc_inference(args, voc_config): + # model: {model_name}_{dataset} + voc_name = args.voc[:args.voc.rindex('_')] + voc_class = dynamic_import(voc_name, model_alias) + voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) + if voc_name != 'wavernn': + voc = voc_class(**voc_config["generator_params"]) + voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) + voc.remove_weight_norm() + voc.eval() + else: + voc = voc_class(**voc_config["model"]) + voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"]) + voc.eval() + + voc_mu, voc_std = np.load(args.voc_stat) + voc_mu = paddle.to_tensor(voc_mu) + voc_std = paddle.to_tensor(voc_std) + voc_normalizer = ZScore(voc_mu, voc_std) + voc_inference = voc_inference_class(voc_normalizer, voc) + voc_inference.eval() + print("voc done!") + return voc_inference + + +# to static +def am_to_static(args, am_inference, am_name, am_dataset): + if am_name == 'fastspeech2': + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), + InputSpec([1], dtype=paddle.int64), + ], ) + else: + am_inference = jit.to_static( + am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)]) + + elif am_name == 'speedyspeech': + if 
am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), # text + InputSpec([-1], dtype=paddle.int64), # tone + InputSpec([1], dtype=paddle.int64), # spk_id + None # duration + ]) + else: + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), + InputSpec([-1], dtype=paddle.int64) + ]) + + elif am_name == 'tacotron2': + am_inference = jit.to_static( + am_inference, input_spec=[InputSpec([-1], dtype=paddle.int64)]) + + paddle.jit.save(am_inference, os.path.join(args.inference_dir, args.am)) + am_inference = paddle.jit.load(os.path.join(args.inference_dir, args.am)) + return am_inference + + +def voc_to_static(args, voc_inference): + voc_inference = jit.to_static( + voc_inference, input_spec=[ + InputSpec([-1, 80], dtype=paddle.float32), + ]) + paddle.jit.save(voc_inference, os.path.join(args.inference_dir, args.voc)) + voc_inference = paddle.jit.load(os.path.join(args.inference_dir, args.voc)) + return voc_inference diff --git a/paddlespeech/t2s/exps/synthesize.py b/paddlespeech/t2s/exps/synthesize.py index 81da14f2e..abb1eb4eb 100644 --- a/paddlespeech/t2s/exps/synthesize.py +++ b/paddlespeech/t2s/exps/synthesize.py @@ -23,48 +23,11 @@ import yaml from timer import timer from yacs.config import CfgNode -from paddlespeech.s2t.utils.dynamic_import import dynamic_import -from paddlespeech.t2s.datasets.data_table import DataTable -from paddlespeech.t2s.modules.normalizer import ZScore +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_test_dataset +from paddlespeech.t2s.exps.syn_utils import get_voc_inference from paddlespeech.t2s.utils import str2bool -model_alias = { - # acoustic model - "speedyspeech": - "paddlespeech.t2s.models.speedyspeech:SpeedySpeech", - "speedyspeech_inference": - "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference", - "fastspeech2": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2", - "fastspeech2_inference": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", - "tacotron2": - "paddlespeech.t2s.models.tacotron2:Tacotron2", - "tacotron2_inference": - "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", - # voc - "pwgan": - "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", - "pwgan_inference": - "paddlespeech.t2s.models.parallel_wavegan:PWGInference", - "mb_melgan": - "paddlespeech.t2s.models.melgan:MelGANGenerator", - "mb_melgan_inference": - "paddlespeech.t2s.models.melgan:MelGANInference", - "style_melgan": - "paddlespeech.t2s.models.melgan:StyleMelGANGenerator", - "style_melgan_inference": - "paddlespeech.t2s.models.melgan:StyleMelGANInference", - "hifigan": - "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", - "hifigan_inference": - "paddlespeech.t2s.models.hifigan:HiFiGANInference", - "wavernn": - "paddlespeech.t2s.models.wavernn:WaveRNN", - "wavernn_inference": - "paddlespeech.t2s.models.wavernn:WaveRNNInference", -} - def evaluate(args): # dataloader has been too verbose @@ -86,96 +49,12 @@ def evaluate(args): print(am_config) print(voc_config) - # construct dataset for evaluation - - # model: {model_name}_{dataset} - am_name = args.am[:args.am.rindex('_')] - am_dataset = args.am[args.am.rindex('_') + 1:] - - if am_name == 'fastspeech2': - fields = ["utt_id", "text"] - spk_num = None - if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: - print("multiple speaker fastspeech2!") - with open(args.speaker_dict, 'rt') as 
f: - spk_id = [line.strip().split() for line in f.readlines()] - spk_num = len(spk_id) - fields += ["spk_id"] - elif args.voice_cloning: - print("voice cloning!") - fields += ["spk_emb"] - else: - print("single speaker fastspeech2!") - print("spk_num:", spk_num) - elif am_name == 'speedyspeech': - fields = ["utt_id", "phones", "tones"] - elif am_name == 'tacotron2': - fields = ["utt_id", "text"] - if args.voice_cloning: - print("voice cloning!") - fields += ["spk_emb"] - - test_dataset = DataTable(data=test_metadata, fields=fields) - - with open(args.phones_dict, "r") as f: - phn_id = [line.strip().split() for line in f.readlines()] - vocab_size = len(phn_id) - print("vocab_size:", vocab_size) - - tone_size = None - if args.tones_dict: - with open(args.tones_dict, "r") as f: - tone_id = [line.strip().split() for line in f.readlines()] - tone_size = len(tone_id) - print("tone_size:", tone_size) - # acoustic model - odim = am_config.n_mels - am_class = dynamic_import(am_name, model_alias) - am_inference_class = dynamic_import(am_name + '_inference', model_alias) - - if am_name == 'fastspeech2': - am = am_class( - idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"]) - elif am_name == 'speedyspeech': - am = am_class( - vocab_size=vocab_size, tone_size=tone_size, **am_config["model"]) - elif am_name == 'tacotron2': - am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) - - am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) - am.eval() - am_mu, am_std = np.load(args.am_stat) - am_mu = paddle.to_tensor(am_mu) - am_std = paddle.to_tensor(am_std) - am_normalizer = ZScore(am_mu, am_std) - am_inference = am_inference_class(am_normalizer, am) - print("am_inference.training0:", am_inference.training) - am_inference.eval() - print("acoustic model done!") + am_inference, am_name, am_dataset = get_am_inference(args, am_config) + test_dataset = get_test_dataset(args, test_metadata, am_name, am_dataset) # vocoder - # model: {model_name}_{dataset} - voc_name = args.voc[:args.voc.rindex('_')] - voc_class = dynamic_import(voc_name, model_alias) - voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) - if voc_name != 'wavernn': - voc = voc_class(**voc_config["generator_params"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) - voc.remove_weight_norm() - voc.eval() - else: - voc = voc_class(**voc_config["model"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"]) - voc.eval() - voc_mu, voc_std = np.load(args.voc_stat) - voc_mu = paddle.to_tensor(voc_mu) - voc_std = paddle.to_tensor(voc_std) - voc_normalizer = ZScore(voc_mu, voc_std) - voc_inference = voc_inference_class(voc_normalizer, voc) - print("voc_inference.training0:", voc_inference.training) - voc_inference.eval() - print("voc done!") + voc_inference = get_voc_inference(args, voc_config) output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) @@ -227,7 +106,7 @@ def evaluate(args): print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") -def main(): +def parse_args(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser( description="Synthesize with acoustic model & vocoder") @@ -264,7 +143,6 @@ def main(): "--tones_dict", type=str, default=None, help="tone vocabulary file.") parser.add_argument( "--speaker_dict", type=str, default=None, help="speaker id map file.") - parser.add_argument( "--voice-cloning", type=str2bool, @@ -278,10 +156,10 @@ def main(): choices=[ 'pwgan_csmsc', 'pwgan_ljspeech', 
'pwgan_aishell3', 'pwgan_vctk', 'mb_melgan_csmsc', 'wavernn_csmsc', 'hifigan_csmsc', + 'hifigan_ljspeech', 'hifigan_aishell3', 'hifigan_vctk', 'style_melgan_csmsc' ], help='Choose vocoder type of tts task.') - parser.add_argument( '--voc_config', type=str, @@ -302,7 +180,12 @@ def main(): parser.add_argument("--output_dir", type=str, help="output dir.") args = parser.parse_args() + return args + + +def main(): + args = parse_args() if args.ngpu == 0: paddle.set_device("cpu") elif args.ngpu > 0: diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 94180f853..10b33c60a 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -12,59 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. import argparse -import os from pathlib import Path -import numpy as np import paddle import soundfile as sf import yaml -from paddle import jit -from paddle.static import InputSpec from timer import timer from yacs.config import CfgNode -from paddlespeech.s2t.utils.dynamic_import import dynamic_import -from paddlespeech.t2s.frontend import English -from paddlespeech.t2s.frontend.zh_frontend import Frontend -from paddlespeech.t2s.modules.normalizer import ZScore - -model_alias = { - # acoustic model - "speedyspeech": - "paddlespeech.t2s.models.speedyspeech:SpeedySpeech", - "speedyspeech_inference": - "paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference", - "fastspeech2": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2", - "fastspeech2_inference": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", - "tacotron2": - "paddlespeech.t2s.models.tacotron2:Tacotron2", - "tacotron2_inference": - "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", - # voc - "pwgan": - "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", - "pwgan_inference": - "paddlespeech.t2s.models.parallel_wavegan:PWGInference", - "mb_melgan": - "paddlespeech.t2s.models.melgan:MelGANGenerator", - "mb_melgan_inference": - "paddlespeech.t2s.models.melgan:MelGANInference", - "style_melgan": - "paddlespeech.t2s.models.melgan:StyleMelGANGenerator", - "style_melgan_inference": - "paddlespeech.t2s.models.melgan:StyleMelGANInference", - "hifigan": - "paddlespeech.t2s.models.hifigan:HiFiGANGenerator", - "hifigan_inference": - "paddlespeech.t2s.models.hifigan:HiFiGANInference", - "wavernn": - "paddlespeech.t2s.models.wavernn:WaveRNN", - "wavernn_inference": - "paddlespeech.t2s.models.wavernn:WaveRNNInference", -} +from paddlespeech.t2s.exps.syn_utils import am_to_static +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.exps.syn_utils import voc_to_static def evaluate(args): @@ -81,151 +42,24 @@ def evaluate(args): print(am_config) print(voc_config) - # construct dataset for evaluation - sentences = [] - with open(args.text, 'rt') as f: - for line in f: - items = line.strip().split() - utt_id = items[0] - if args.lang == 'zh': - sentence = "".join(items[1:]) - elif args.lang == 'en': - sentence = " ".join(items[1:]) - sentences.append((utt_id, sentence)) - - with open(args.phones_dict, "r") as f: - phn_id = [line.strip().split() for line in f.readlines()] - vocab_size = len(phn_id) - print("vocab_size:", vocab_size) - - tone_size = None - if args.tones_dict: - with 
open(args.tones_dict, "r") as f: - tone_id = [line.strip().split() for line in f.readlines()] - tone_size = len(tone_id) - print("tone_size:", tone_size) - - spk_num = None - if args.speaker_dict: - with open(args.speaker_dict, 'rt') as f: - spk_id = [line.strip().split() for line in f.readlines()] - spk_num = len(spk_id) - print("spk_num:", spk_num) + sentences = get_sentences(args) # frontend - if args.lang == 'zh': - frontend = Frontend( - phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) - elif args.lang == 'en': - frontend = English(phone_vocab_path=args.phones_dict) - print("frontend done!") + frontend = get_frontend(args) # acoustic model - odim = am_config.n_mels - # model: {model_name}_{dataset} - am_name = args.am[:args.am.rindex('_')] - am_dataset = args.am[args.am.rindex('_') + 1:] - - am_class = dynamic_import(am_name, model_alias) - am_inference_class = dynamic_import(am_name + '_inference', model_alias) - - if am_name == 'fastspeech2': - am = am_class( - idim=vocab_size, odim=odim, spk_num=spk_num, **am_config["model"]) - elif am_name == 'speedyspeech': - am = am_class( - vocab_size=vocab_size, - tone_size=tone_size, - spk_num=spk_num, - **am_config["model"]) - elif am_name == 'tacotron2': - am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) - - am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) - am.eval() - am_mu, am_std = np.load(args.am_stat) - am_mu = paddle.to_tensor(am_mu) - am_std = paddle.to_tensor(am_std) - am_normalizer = ZScore(am_mu, am_std) - am_inference = am_inference_class(am_normalizer, am) - am_inference.eval() - print("acoustic model done!") + am_inference, am_name, am_dataset = get_am_inference(args, am_config) # vocoder - # model: {model_name}_{dataset} - voc_name = args.voc[:args.voc.rindex('_')] - voc_class = dynamic_import(voc_name, model_alias) - voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) - if voc_name != 'wavernn': - voc = voc_class(**voc_config["generator_params"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) - voc.remove_weight_norm() - voc.eval() - else: - voc = voc_class(**voc_config["model"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["main_params"]) - voc.eval() - - voc_mu, voc_std = np.load(args.voc_stat) - voc_mu = paddle.to_tensor(voc_mu) - voc_std = paddle.to_tensor(voc_std) - voc_normalizer = ZScore(voc_mu, voc_std) - voc_inference = voc_inference_class(voc_normalizer, voc) - voc_inference.eval() - print("voc done!") + voc_inference = get_voc_inference(args, voc_config) # whether dygraph to static if args.inference_dir: # acoustic model - if am_name == 'fastspeech2': - if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: - am_inference = jit.to_static( - am_inference, - input_spec=[ - InputSpec([-1], dtype=paddle.int64), - InputSpec([1], dtype=paddle.int64) - ]) - else: - am_inference = jit.to_static( - am_inference, - input_spec=[InputSpec([-1], dtype=paddle.int64)]) - - elif am_name == 'speedyspeech': - if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: - am_inference = jit.to_static( - am_inference, - input_spec=[ - InputSpec([-1], dtype=paddle.int64), # text - InputSpec([-1], dtype=paddle.int64), # tone - InputSpec([1], dtype=paddle.int64), # spk_id - None # duration - ]) - else: - am_inference = jit.to_static( - am_inference, - input_spec=[ - InputSpec([-1], dtype=paddle.int64), - InputSpec([-1], dtype=paddle.int64) - ]) - - elif am_name == 'tacotron2': - am_inference = jit.to_static( - am_inference, 
input_spec=[InputSpec([-1], dtype=paddle.int64)]) - - paddle.jit.save(am_inference, os.path.join(args.inference_dir, args.am)) - am_inference = paddle.jit.load( - os.path.join(args.inference_dir, args.am)) + am_inference = am_to_static(args, am_inference, am_name, am_dataset) # vocoder - voc_inference = jit.to_static( - voc_inference, - input_spec=[ - InputSpec([-1, 80], dtype=paddle.float32), - ]) - paddle.jit.save(voc_inference, - os.path.join(args.inference_dir, args.voc)) - voc_inference = paddle.jit.load( - os.path.join(args.inference_dir, args.voc)) + voc_inference = voc_to_static(args, voc_inference) output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) @@ -298,7 +132,7 @@ def evaluate(args): print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") -def main(): +def parse_args(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser( description="Synthesize with acoustic model & vocoder") @@ -346,12 +180,19 @@ def main(): type=str, default='pwgan_csmsc', choices=[ - 'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk', - 'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc', - 'wavernn_csmsc' + 'pwgan_csmsc', + 'pwgan_ljspeech', + 'pwgan_aishell3', + 'pwgan_vctk', + 'mb_melgan_csmsc', + 'style_melgan_csmsc', + 'hifigan_csmsc', + 'hifigan_ljspeech', + 'hifigan_aishell3', + 'hifigan_vctk', + 'wavernn_csmsc', ], help='Choose vocoder type of tts task.') - parser.add_argument( '--voc_config', type=str, @@ -386,6 +227,11 @@ def main(): parser.add_argument("--output_dir", type=str, help="output dir.") args = parser.parse_args() + return args + + +def main(): + args = parse_args() if args.ngpu == 0: paddle.set_device("cpu") diff --git a/paddlespeech/t2s/exps/synthesize_streaming.py b/paddlespeech/t2s/exps/synthesize_streaming.py new file mode 100644 index 000000000..f38b2d352 --- /dev/null +++ b/paddlespeech/t2s/exps/synthesize_streaming.py @@ -0,0 +1,274 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import math +from pathlib import Path + +import numpy as np +import paddle +import soundfile as sf +import yaml +from timer import timer +from yacs.config import CfgNode + +from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.t2s.exps.syn_utils import get_frontend +from paddlespeech.t2s.exps.syn_utils import get_sentences +from paddlespeech.t2s.exps.syn_utils import get_voc_inference +from paddlespeech.t2s.exps.syn_utils import model_alias +from paddlespeech.t2s.utils import str2bool + + +def denorm(data, mean, std): + return data * std + mean + + +def get_chunks(data, chunk_size, pad_size): + data_len = data.shape[1] + chunks = [] + n = math.ceil(data_len / chunk_size) + for i in range(n): + start = max(0, i * chunk_size - pad_size) + end = min((i + 1) * chunk_size + pad_size, data_len) + chunks.append(data[:, start:end, :]) + return chunks + + +def evaluate(args): + + # Init body. 
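get_chunks() above is the core of the streaming path: the encoder output of shape (1, T, adim) is cut into fixed-size pieces with pad_size extra frames on each side, the decoder and postnet run on each padded piece, and the pad frames are sliced off again before the mel chunks are concatenated (the sub_mel slicing in the loop below). A standalone numpy sketch of the chunking, using the CLI defaults chunk_size=42 and pad_size=12 declared further down; the tensor sizes are illustrative:

```python
# Same logic as get_chunks() above, exercised on a dummy array so the
# chunk boundaries are easy to see.
import math

import numpy as np


def get_chunks(data, chunk_size, pad_size):
    data_len = data.shape[1]
    chunks = []
    n = math.ceil(data_len / chunk_size)
    for i in range(n):
        start = max(0, i * chunk_size - pad_size)
        end = min((i + 1) * chunk_size + pad_size, data_len)
        chunks.append(data[:, start:end, :])
    return chunks


hs = np.zeros((1, 100, 256))   # pretend encoder output: T=100 frames, adim=256
for i, chunk in enumerate(get_chunks(hs, chunk_size=42, pad_size=12)):
    print(i, chunk.shape)
# 0 (1, 54, 256)   frames [0, 54): 42-frame chunk + 12 pad frames on the right
# 1 (1, 66, 256)   frames [30, 96): 12 pad + 42-frame chunk + 12 pad
# 2 (1, 28, 256)   frames [72, 100): 12 pad + the remaining 16 frames
```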
+ with open(args.am_config) as f: + am_config = CfgNode(yaml.safe_load(f)) + with open(args.voc_config) as f: + voc_config = CfgNode(yaml.safe_load(f)) + + print("========Args========") + print(yaml.safe_dump(vars(args))) + print("========Config========") + print(am_config) + print(voc_config) + + sentences = get_sentences(args) + + # frontend + frontend = get_frontend(args) + + with open(args.phones_dict, "r") as f: + phn_id = [line.strip().split() for line in f.readlines()] + vocab_size = len(phn_id) + print("vocab_size:", vocab_size) + + # acoustic model, only support fastspeech2 here now! + # am_inference, am_name, am_dataset = get_am_inference(args, am_config) + # model: {model_name}_{dataset} + am_name = args.am[:args.am.rindex('_')] + am_dataset = args.am[args.am.rindex('_') + 1:] + odim = am_config.n_mels + + am_class = dynamic_import(am_name, model_alias) + am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) + am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) + am.eval() + am_mu, am_std = np.load(args.am_stat) + am_mu = paddle.to_tensor(am_mu) + am_std = paddle.to_tensor(am_std) + + # vocoder + voc_inference = get_voc_inference(args, voc_config) + + output_dir = Path(args.output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + merge_sentences = True + + N = 0 + T = 0 + chunk_size = args.chunk_size + pad_size = args.pad_size + + for utt_id, sentence in sentences: + with timer() as t: + get_tone_ids = False + + if args.lang == 'zh': + input_ids = frontend.get_input_ids( + sentence, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) + + phone_ids = input_ids["phone_ids"] + else: + print("lang should in be 'zh' here!") + # merge_sentences=True here, so we only use the first item of phone_ids + phone_ids = phone_ids[0] + with paddle.no_grad(): + # acoustic model + orig_hs, h_masks = am.encoder_infer(phone_ids) + + if args.am_streaming: + hss = get_chunks(orig_hs, chunk_size, pad_size) + chunk_num = len(hss) + mel_list = [] + for i, hs in enumerate(hss): + before_outs, _ = am.decoder(hs) + after_outs = before_outs + am.postnet( + before_outs.transpose((0, 2, 1))).transpose( + (0, 2, 1)) + normalized_mel = after_outs[0] + sub_mel = denorm(normalized_mel, am_mu, am_std) + # clip output part of pad + if i == 0: + sub_mel = sub_mel[:-pad_size] + elif i == chunk_num - 1: + # 最后一块的右侧一定没有 pad 够 + sub_mel = sub_mel[pad_size:] + else: + # 倒数几块的右侧也可能没有 pad 够 + sub_mel = sub_mel[pad_size:(chunk_size + pad_size) - + sub_mel.shape[0]] + mel_list.append(sub_mel) + mel = paddle.concat(mel_list, axis=0) + + else: + before_outs, _ = am.decoder(orig_hs) + after_outs = before_outs + am.postnet( + before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) + normalized_mel = after_outs[0] + mel = denorm(normalized_mel, am_mu, am_std) + + # vocoder + wav = voc_inference(mel) + + wav = wav.numpy() + N += wav.size + T += t.elapse + speed = wav.size / t.elapse + rtf = am_config.fs / speed + print( + f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}." 
+ ) + sf.write( + str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs) + print(f"{utt_id} done!") + print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }") + + +def parse_args(): + # parse args and config and redirect to train_sp + parser = argparse.ArgumentParser( + description="Synthesize with acoustic model & vocoder") + # acoustic model + parser.add_argument( + '--am', + type=str, + default='fastspeech2_csmsc', + choices=['fastspeech2_csmsc'], + help='Choose acoustic model type of tts task.') + parser.add_argument( + '--am_config', + type=str, + default=None, + help='Config of acoustic model. Use deault config when it is None.') + parser.add_argument( + '--am_ckpt', + type=str, + default=None, + help='Checkpoint file of acoustic model.') + parser.add_argument( + "--am_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training acoustic model." + ) + parser.add_argument( + "--phones_dict", type=str, default=None, help="phone vocabulary file.") + parser.add_argument( + "--tones_dict", type=str, default=None, help="tone vocabulary file.") + + # vocoder + parser.add_argument( + '--voc', + type=str, + default='pwgan_csmsc', + choices=[ + 'pwgan_csmsc', + 'pwgan_ljspeech', + 'pwgan_aishell3', + 'pwgan_vctk', + 'mb_melgan_csmsc', + 'style_melgan_csmsc', + 'hifigan_csmsc', + 'hifigan_ljspeech', + 'hifigan_aishell3', + 'hifigan_vctk', + 'wavernn_csmsc', + ], + help='Choose vocoder type of tts task.') + parser.add_argument( + '--voc_config', + type=str, + default=None, + help='Config of voc. Use deault config when it is None.') + parser.add_argument( + '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.') + parser.add_argument( + "--voc_stat", + type=str, + default=None, + help="mean and standard deviation used to normalize spectrogram when training voc." + ) + # other + parser.add_argument( + '--lang', + type=str, + default='zh', + help='Choose model language. 
zh or en') + + parser.add_argument( + "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") + parser.add_argument( + "--text", + type=str, + help="text to synthesize, a 'utt_id sentence' pair per line.") + + parser.add_argument( + "--am_streaming", + type=str2bool, + default=False, + help="whether use streaming acoustic model") + parser.add_argument( + "--chunk_size", type=int, default=42, help="chunk size of am streaming") + parser.add_argument( + "--pad_size", type=int, default=12, help="pad size of am streaming") + + parser.add_argument("--output_dir", type=str, help="output dir.") + + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + if args.ngpu == 0: + paddle.set_device("cpu") + elif args.ngpu > 0: + paddle.set_device("gpu") + else: + print("ngpu should >= 0 !") + + evaluate(args) + + +if __name__ == "__main__": + main() diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index d521ce896..45ecb269b 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -42,10 +42,12 @@ from paddlespeech.t2s.training.trainer import Trainer def train_sp(args, config): # decides device type and whether to run in parallel # setup running environment correctly - if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0: - paddle.set_device("cpu") - else: + if paddle.is_compiled_with_cuda() and args.ngpu > 0: paddle.set_device("gpu") + elif paddle.is_compiled_with_npu() and args.ngpu > 0: + paddle.set_device("npu") + else: + paddle.set_device("cpu") world_size = paddle.distributed.get_world_size() if world_size > 1: paddle.distributed.init_parallel_env() diff --git a/paddlespeech/t2s/exps/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning.py index 3de30774f..1afd21dff 100644 --- a/paddlespeech/t2s/exps/voice_cloning.py +++ b/paddlespeech/t2s/exps/voice_cloning.py @@ -21,29 +21,12 @@ import soundfile as sf import yaml from yacs.config import CfgNode -from paddlespeech.s2t.utils.dynamic_import import dynamic_import +from paddlespeech.t2s.exps.syn_utils import get_am_inference +from paddlespeech.t2s.exps.syn_utils import get_voc_inference from paddlespeech.t2s.frontend.zh_frontend import Frontend -from paddlespeech.t2s.modules.normalizer import ZScore from paddlespeech.vector.exps.ge2e.audio_processor import SpeakerVerificationPreprocessor from paddlespeech.vector.models.lstm_speaker_encoder import LSTMSpeakerEncoder -model_alias = { - # acoustic model - "fastspeech2": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2", - "fastspeech2_inference": - "paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference", - "tacotron2": - "paddlespeech.t2s.models.tacotron2:Tacotron2", - "tacotron2_inference": - "paddlespeech.t2s.models.tacotron2:Tacotron2Inference", - # voc - "pwgan": - "paddlespeech.t2s.models.parallel_wavegan:PWGGenerator", - "pwgan_inference": - "paddlespeech.t2s.models.parallel_wavegan:PWGInference", -} - def voice_cloning(args): # Init body. 
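With voice_cloning.py also switched over, get_frontend, get_am_inference and get_voc_inference from syn_utils.py now back synthesize.py, synthesize_e2e.py, synthesize_streaming.py and voice_cloning.py. A hypothetical sketch of calling the helpers directly; every file name below is a placeholder, and the helpers expect an argparse-style namespace plus yacs configs exactly as the scripts build them:

```python
# Hypothetical driver for the new syn_utils helpers. All paths/checkpoints
# are placeholders; substitute real ones from a trained model.
import argparse

import paddle
import soundfile as sf
import yaml
from yacs.config import CfgNode

from paddlespeech.t2s.exps.syn_utils import get_am_inference
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_voc_inference

args = argparse.Namespace(
    lang='zh',
    am='fastspeech2_csmsc',
    am_ckpt='fastspeech2_csmsc/snapshot.pdz',           # placeholder
    am_stat='fastspeech2_csmsc/speech_stats.npy',       # placeholder
    phones_dict='fastspeech2_csmsc/phone_id_map.txt',   # placeholder
    tones_dict=None,
    speaker_dict=None,
    voc='hifigan_csmsc',
    voc_ckpt='hifigan_csmsc/snapshot.pdz',              # placeholder
    voc_stat='hifigan_csmsc/feats_stats.npy')           # placeholder

with open('fastspeech2_csmsc/default.yaml') as f:       # placeholder
    am_config = CfgNode(yaml.safe_load(f))
with open('hifigan_csmsc/default.yaml') as f:           # placeholder
    voc_config = CfgNode(yaml.safe_load(f))

frontend = get_frontend(args)
am_inference, am_name, am_dataset = get_am_inference(args, am_config)
voc_inference = get_voc_inference(args, voc_config)

phone_ids = frontend.get_input_ids(
    "今天天气真不错。", merge_sentences=True)["phone_ids"][0]
with paddle.no_grad():
    mel = am_inference(phone_ids)   # denormalized log-mel, shape (T, n_mels)
    wav = voc_inference(mel)
sf.write("demo.wav", wav.numpy(), samplerate=am_config.fs)
```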
@@ -79,55 +62,14 @@ def voice_cloning(args): speaker_encoder.eval() print("GE2E Done!") - with open(args.phones_dict, "r") as f: - phn_id = [line.strip().split() for line in f.readlines()] - vocab_size = len(phn_id) - print("vocab_size:", vocab_size) + frontend = Frontend(phone_vocab_path=args.phones_dict) + print("frontend done!") # acoustic model - odim = am_config.n_mels - # model: {model_name}_{dataset} - am_name = args.am[:args.am.rindex('_')] - am_dataset = args.am[args.am.rindex('_') + 1:] - - am_class = dynamic_import(am_name, model_alias) - am_inference_class = dynamic_import(am_name + '_inference', model_alias) - - if am_name == 'fastspeech2': - am = am_class( - idim=vocab_size, odim=odim, spk_num=None, **am_config["model"]) - elif am_name == 'tacotron2': - am = am_class(idim=vocab_size, odim=odim, **am_config["model"]) - - am.set_state_dict(paddle.load(args.am_ckpt)["main_params"]) - am.eval() - am_mu, am_std = np.load(args.am_stat) - am_mu = paddle.to_tensor(am_mu) - am_std = paddle.to_tensor(am_std) - am_normalizer = ZScore(am_mu, am_std) - am_inference = am_inference_class(am_normalizer, am) - am_inference.eval() - print("acoustic model done!") + am_inference, *_ = get_am_inference(args, am_config) # vocoder - # model: {model_name}_{dataset} - voc_name = args.voc[:args.voc.rindex('_')] - voc_class = dynamic_import(voc_name, model_alias) - voc_inference_class = dynamic_import(voc_name + '_inference', model_alias) - voc = voc_class(**voc_config["generator_params"]) - voc.set_state_dict(paddle.load(args.voc_ckpt)["generator_params"]) - voc.remove_weight_norm() - voc.eval() - voc_mu, voc_std = np.load(args.voc_stat) - voc_mu = paddle.to_tensor(voc_mu) - voc_std = paddle.to_tensor(voc_std) - voc_normalizer = ZScore(voc_mu, voc_std) - voc_inference = voc_inference_class(voc_normalizer, voc) - voc_inference.eval() - print("voc done!") - - frontend = Frontend(phone_vocab_path=args.phones_dict) - print("frontend done!") + voc_inference = get_voc_inference(args, voc_config) output_dir = Path(args.output_dir) output_dir.mkdir(parents=True, exist_ok=True) @@ -170,7 +112,7 @@ def voice_cloning(args): print(f"{utt_id} done!") -def main(): +def parse_args(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="") parser.add_argument( @@ -240,6 +182,11 @@ def main(): parser.add_argument("--output-dir", type=str, help="output dir.") args = parser.parse_args() + return args + + +def main(): + args = parse_args() if args.ngpu == 0: paddle.set_device("cpu") diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py index ea5189135..ea4558e2a 100644 --- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py +++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py @@ -64,7 +64,7 @@ def replace_time(match) -> str: result = f"{num2str(hour)}点" if minute.lstrip('0'): if int(minute) == 30: - result += f"半" + result += "半" else: result += f"{_time_num2str(minute)}分" if second and second.lstrip('0'): @@ -75,7 +75,7 @@ def replace_time(match) -> str: result += f"{num2str(hour_2)}点" if minute_2.lstrip('0'): if int(minute) == 30: - result += f"半" + result += "半" else: result += f"{_time_num2str(minute_2)}分" if second_2 and second_2.lstrip('0'): diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 73f5498e7..c2f1e218f 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ 
b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -14,6 +14,7 @@ # Modified from espnet(https://github.com/espnet/espnet) """Fastspeech2 related modules for paddle""" from typing import Dict +from typing import List from typing import Sequence from typing import Tuple from typing import Union @@ -32,6 +33,8 @@ from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredic from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor from paddlespeech.t2s.modules.tacotron2.decoder import Postnet +from paddlespeech.t2s.modules.transformer.encoder import CNNDecoder +from paddlespeech.t2s.modules.transformer.encoder import CNNPostnet from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder @@ -97,6 +100,12 @@ class FastSpeech2(nn.Layer): zero_triu: bool=False, conformer_enc_kernel_size: int=7, conformer_dec_kernel_size: int=31, + # for CNN Decoder + cnn_dec_dropout_rate: float=0.2, + cnn_postnet_dropout_rate: float=0.2, + cnn_postnet_resblock_kernel_sizes: List[int]=[256, 256], + cnn_postnet_kernel_size: int=5, + cnn_decoder_embedding_dim: int=256, # duration predictor duration_predictor_layers: int=2, duration_predictor_chans: int=384, @@ -392,6 +401,13 @@ class FastSpeech2(nn.Layer): activation_type=conformer_activation_type, use_cnn_module=use_cnn_in_conformer, cnn_module_kernel=conformer_dec_kernel_size, ) + elif decoder_type == 'cnndecoder': + self.decoder = CNNDecoder( + emb_dim=adim, + odim=odim, + kernel_size=cnn_postnet_kernel_size, + dropout_rate=cnn_dec_dropout_rate, + resblock_kernel_sizes=cnn_postnet_resblock_kernel_sizes) else: raise ValueError(f"{decoder_type} is not supported.") @@ -399,14 +415,21 @@ class FastSpeech2(nn.Layer): self.feat_out = nn.Linear(adim, odim * reduction_factor) # define postnet - self.postnet = (None if postnet_layers == 0 else Postnet( - idim=idim, - odim=odim, - n_layers=postnet_layers, - n_chans=postnet_chans, - n_filts=postnet_filts, - use_batch_norm=use_batch_norm, - dropout_rate=postnet_dropout_rate, )) + if decoder_type == 'cnndecoder': + self.postnet = CNNPostnet( + odim=odim, + kernel_size=cnn_postnet_kernel_size, + dropout_rate=cnn_postnet_dropout_rate, + resblock_kernel_sizes=cnn_postnet_resblock_kernel_sizes) + else: + self.postnet = (None if postnet_layers == 0 else Postnet( + idim=idim, + odim=odim, + n_layers=postnet_layers, + n_chans=postnet_chans, + n_filts=postnet_filts, + use_batch_norm=use_batch_norm, + dropout_rate=postnet_dropout_rate, )) nn.initializer.set_global_initializer(None) @@ -486,6 +509,7 @@ class FastSpeech2(nn.Layer): ps: paddle.Tensor=None, es: paddle.Tensor=None, is_inference: bool=False, + return_after_enc=False, alpha: float=1.0, spk_emb=None, spk_id=None, @@ -562,15 +586,21 @@ class FastSpeech2(nn.Layer): [olen // self.reduction_factor for olen in olens.numpy()]) else: olens_in = olens + # (B, 1, T) h_masks = self._source_mask(olens_in) else: h_masks = None - # (B, Lmax, adim) + if return_after_enc: + return hs, h_masks + # (B, Lmax, adim) zs, _ = self.decoder(hs, h_masks) # (B, Lmax, odim) - before_outs = self.feat_out(zs).reshape( - (paddle.shape(zs)[0], -1, self.odim)) + if self.decoder_type == 'cnndecoder': + before_outs = zs + else: + before_outs = self.feat_out(zs).reshape( + (paddle.shape(zs)[0], -1, self.odim)) # postnet -> (B, Lmax//r * r, odim) if self.postnet is None: @@ -581,10 +611,42 @@ 
class FastSpeech2(nn.Layer): return before_outs, after_outs, d_outs, p_outs, e_outs + def encoder_infer( + self, + text: paddle.Tensor, + alpha: float=1.0, + spk_emb=None, + spk_id=None, + tone_id=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]: + # input of embedding must be int64 + x = paddle.cast(text, 'int64') + # setup batch axis + ilens = paddle.shape(x)[0] + + xs = x.unsqueeze(0) + + if spk_emb is not None: + spk_emb = spk_emb.unsqueeze(0) + + if tone_id is not None: + tone_id = tone_id.unsqueeze(0) + + # (1, L, odim) + hs, h_masks = self._forward( + xs, + ilens, + is_inference=True, + return_after_enc=True, + alpha=alpha, + spk_emb=spk_emb, + spk_id=spk_id, + tone_id=tone_id) + return hs, h_masks + def inference( self, text: paddle.Tensor, - speech: paddle.Tensor=None, durations: paddle.Tensor=None, pitch: paddle.Tensor=None, energy: paddle.Tensor=None, @@ -598,7 +660,6 @@ class FastSpeech2(nn.Layer): Args: text(Tensor(int64)): Input sequence of characters (T,). - speech(Tensor, optional): Feature sequence to extract style (N, idim). durations(Tensor, optional (int64)): Groundtruth of duration (T,). pitch(Tensor, optional): Groundtruth of token-averaged pitch (T, 1). energy(Tensor, optional): Groundtruth of token-averaged energy (T, 1). @@ -615,15 +676,11 @@ class FastSpeech2(nn.Layer): """ # input of embedding must be int64 x = paddle.cast(text, 'int64') - y = speech d, p, e = durations, pitch, energy # setup batch axis ilens = paddle.shape(x)[0] - xs, ys = x.unsqueeze(0), None - - if y is not None: - ys = y.unsqueeze(0) + xs = x.unsqueeze(0) if spk_emb is not None: spk_emb = spk_emb.unsqueeze(0) @@ -641,7 +698,6 @@ class FastSpeech2(nn.Layer): _, outs, d_outs, p_outs, e_outs = self._forward( xs, ilens, - ys, ds=ds, ps=ps, es=es, @@ -654,7 +710,6 @@ class FastSpeech2(nn.Layer): _, outs, d_outs, p_outs, e_outs = self._forward( xs, ilens, - ys, is_inference=True, alpha=alpha, spk_emb=spk_emb, @@ -802,7 +857,6 @@ class StyleFastSpeech2Inference(FastSpeech2Inference): Args: text(Tensor(int64)): Input sequence of characters (T,). - speech(Tensor, optional): Feature sequence to extract style (N, idim). durations(paddle.Tensor/np.ndarray, optional (int64)): Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias durations_scale(int/float, optional): durations_bias(int/float, optional): diff --git a/paddlespeech/t2s/models/hifigan/hifigan.py b/paddlespeech/t2s/models/hifigan/hifigan.py index 116376ece..ac5ff204f 100644 --- a/paddlespeech/t2s/models/hifigan/hifigan.py +++ b/paddlespeech/t2s/models/hifigan/hifigan.py @@ -1,7 +1,17 @@ -# -*- coding: utf-8 -*- -"""HiFi-GAN Modules. -This code is based on https://github.com/jik876/hifi-gan. -""" +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# This code is based on https://github.com/jik876/hifi-gan. 
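
The FastSpeech2 changes above add a purely convolutional decoder (`decoder_type='cnndecoder'`) and an `encoder_infer` method that stops after the encoder/variance-adaptor stage and returns the upsampled hidden states plus their mask. Because a CNN decoder has a bounded receptive field, this makes chunk-wise (streaming-style) mel generation possible. The sketch below is illustrative only: it assumes a trained FastSpeech2 built with `decoder_type='cnndecoder'`, and the chunking loop, `chunk_size`, and the residual postnet call are not part of the diff.

```python
import paddle

# Illustrative chunk-wise synthesis with the new CNN decoder; `model` is assumed
# to be a FastSpeech2 instance constructed with decoder_type='cnndecoder'.
def chunked_synthesize(model, text_ids, chunk_size=64):
    # single encoder + variance-adaptor pass: hs is (1, T_dec, adim)
    hs, _ = model.encoder_infer(paddle.to_tensor(text_ids, dtype='int64'))
    mels = []
    for start in range(0, hs.shape[1], chunk_size):
        hs_chunk = hs[:, start:start + chunk_size, :]
        # CNNDecoder already projects to odim, so feat_out is not needed here
        zs, _ = model.decoder(hs_chunk, None)
        before = zs                                    # (1, chunk, odim)
        # residual postnet, ignoring chunk-boundary effects for simplicity
        after = before + model.postnet(
            before.transpose([0, 2, 1])).transpose([0, 2, 1])
        mels.append(after)
    return paddle.concat(mels, axis=1)[0]              # (T_dec, odim)
```
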
import copy from typing import Any from typing import Dict diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2.py b/paddlespeech/t2s/models/tacotron2/tacotron2.py index abb691b4d..7b306e482 100644 --- a/paddlespeech/t2s/models/tacotron2/tacotron2.py +++ b/paddlespeech/t2s/models/tacotron2/tacotron2.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# Modified from espnet(https://github.com/espnet/espnet) """Tacotron 2 related modules for paddle""" import logging from typing import Dict diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py index 959070432..b4b8b4809 100644 --- a/paddlespeech/t2s/models/wavernn/wavernn.py +++ b/paddlespeech/t2s/models/wavernn/wavernn.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +# Modified from https://github.com/fatchord/WaveRNN import sys import time from typing import List diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py index 93644e24a..db31bcfbb 100644 --- a/paddlespeech/t2s/modules/losses.py +++ b/paddlespeech/t2s/modules/losses.py @@ -489,7 +489,7 @@ def stft(x, """ # calculate window window = signal.get_window(window, win_length, fftbins=True) - window = paddle.to_tensor(window) + window = paddle.to_tensor(window, dtype=x.dtype) x_stft = paddle.signal.stft( x, fft_size, @@ -896,7 +896,7 @@ class MelSpectrogram(nn.Layer): # calculate window window = signal.get_window( self.window, self.win_length, fftbins=True) - window = paddle.to_tensor(window) + window = paddle.to_tensor(window, dtype=x.dtype) else: window = None diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py index 62d707d22..be788e6ed 100644 --- a/paddlespeech/t2s/modules/predictor/length_regulator.py +++ b/paddlespeech/t2s/modules/predictor/length_regulator.py @@ -73,15 +73,21 @@ class LengthRegulator(nn.Layer): batch_size, t_enc = paddle.shape(durations) slens = paddle.sum(durations, -1) t_dec = paddle.max(slens) - M = paddle.zeros([batch_size, t_dec, t_enc]) - for i in range(batch_size): - k = 0 - for j in range(t_enc): - d = durations[i, j] - # If the d == 0, slice action is meaningless and not supported in paddle - if d >= 1: - M[i, k:k + d, j] = 1 - k += d + t_dec_1 = t_dec + 1 + flatten_duration = paddle.cumsum( + paddle.reshape(durations, [batch_size * t_enc])) + 1 + init = paddle.zeros(t_dec_1) + m_batch = batch_size * t_enc + M = paddle.zeros([t_dec_1, m_batch]) + for i in range(m_batch): + d = flatten_duration[i] + m = paddle.concat( + [paddle.ones(d), paddle.zeros(t_dec_1 - d)], axis=0) + M[:, i] = m - init + init = m + M = paddle.reshape(M, shape=[t_dec_1, batch_size, t_enc]) + M = M[1:, :, :] + M = paddle.transpose(M, (1, 0, 2)) encodings = paddle.matmul(M, encodings) return encodings @@ -101,6 +107,16 @@ class LengthRegulator(nn.Layer): assert alpha > 0 ds = paddle.round(ds.cast(dtype=paddle.float32) * alpha) ds = ds.cast(dtype=paddle.int64) + ''' + from distutils.version import LooseVersion + from paddlespeech.t2s.modules.nets_utils import pad_list + # 这里在 paddle 2.2.2 的动转静是不通的 + # if LooseVersion(paddle.__version__) >= "2.3.0" or hasattr(paddle, 'repeat_interleave'): + # if LooseVersion(paddle.__version__) >= "2.3.0": + 
if hasattr(paddle, 'repeat_interleave'): + repeat = [paddle.repeat_interleave(x, d, axis=0) for x, d in zip(xs, ds)] + return pad_list(repeat, self.pad_value) + ''' if is_inference: return self.expand(xs, ds) else: diff --git a/paddlespeech/t2s/modules/transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py index 2b3ee788e..f64202824 100644 --- a/paddlespeech/t2s/modules/transformer/encoder.py +++ b/paddlespeech/t2s/modules/transformer/encoder.py @@ -515,3 +515,132 @@ class ConformerEncoder(BaseEncoder): if self.intermediate_layers is not None: return xs, masks, intermediate_outputs return xs, masks + + +class Conv1dResidualBlock(nn.Layer): + """ + Special module for simplified version of Encoder class. + """ + + def __init__(self, + idim: int=256, + odim: int=256, + kernel_size: int=5, + dropout_rate: float=0.2): + super().__init__() + self.main_block = nn.Sequential( + nn.Conv1D( + idim, odim, kernel_size=kernel_size, padding=kernel_size // 2), + nn.ReLU(), + nn.BatchNorm1D(odim), + nn.Dropout(p=dropout_rate)) + self.conv1d_residual = nn.Conv1D(idim, odim, kernel_size=1) + + def forward(self, xs): + """Encode input sequence. + Args: + xs (Tensor): Input tensor (#batch, idim, T). + Returns: + Tensor: Output tensor (#batch, odim, T). + """ + outputs = self.main_block(xs) + outputs = self.conv1d_residual(xs) + outputs + return outputs + + +class CNNDecoder(nn.Layer): + """ + Much simplified decoder than the original one with Prenet. + """ + + def __init__( + self, + emb_dim: int=256, + odim: int=80, + kernel_size: int=5, + dropout_rate: float=0.2, + resblock_kernel_sizes: List[int]=[256, 256], ): + + super().__init__() + + input_shape = emb_dim + out_sizes = resblock_kernel_sizes + out_sizes.append(out_sizes[-1]) + + in_sizes = [input_shape] + out_sizes[:-1] + self.residual_blocks = nn.LayerList([ + Conv1dResidualBlock( + idim=in_channels, + odim=out_channels, + kernel_size=kernel_size, + dropout_rate=dropout_rate, ) + for in_channels, out_channels in zip(in_sizes, out_sizes) + ]) + self.conv1d = nn.Conv1D( + in_channels=out_sizes[-1], out_channels=odim, kernel_size=1) + + def forward(self, xs, masks=None): + """Encode input sequence. + Args: + xs (Tensor): Input tensor (#batch, time, idim). + masks (Tensor): Mask tensor (#batch, 1, time). + Returns: + Tensor: Output tensor (#batch, time, odim). + """ + # exchange the temporal dimension and the feature dimension + xs = xs.transpose([0, 2, 1]) + if masks is not None: + xs = xs * masks + + for layer in self.residual_blocks: + outputs = layer(xs) + if masks is not None: + # input_mask B * 1 * T + outputs = outputs * masks + xs = outputs + outputs = self.conv1d(outputs) + if masks is not None: + outputs = outputs * masks + outputs = outputs.transpose([0, 2, 1]) + return outputs, masks + + +class CNNPostnet(nn.Layer): + def __init__( + self, + odim: int=80, + kernel_size: int=5, + dropout_rate: float=0.2, + resblock_kernel_sizes: List[int]=[256, 256], ): + super().__init__() + out_sizes = resblock_kernel_sizes + in_sizes = [odim] + out_sizes[:-1] + self.residual_blocks = nn.LayerList([ + Conv1dResidualBlock( + idim=in_channels, + odim=out_channels, + kernel_size=kernel_size, + dropout_rate=dropout_rate) + for in_channels, out_channels in zip(in_sizes, out_sizes) + ]) + self.conv1d = nn.Conv1D( + in_channels=out_sizes[-1], out_channels=odim, kernel_size=1) + + def forward(self, xs, masks=None): + """Encode input sequence. + Args: + xs (Tensor): Input tensor (#batch, odim, time). + masks (Tensor): Mask tensor (#batch, 1, time). 
+ Returns: + Tensor: Output tensor (#batch, odim, time). + """ + for layer in self.residual_blocks: + outputs = layer(xs) + if masks is not None: + # input_mask B * 1 * T + outputs = outputs * masks + xs = outputs + outputs = self.conv1d(outputs) + if masks is not None: + outputs = outputs * masks + return outputs diff --git a/paddlespeech/vector/cluster/__init__.py b/paddlespeech/vector/cluster/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddlespeech/vector/cluster/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py new file mode 100644 index 000000000..686de9363 --- /dev/null +++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py @@ -0,0 +1,119 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
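
The `Conv1dResidualBlock`, `CNNDecoder` and `CNNPostnet` added to `encoder.py` above are plain stacks of 1-D convolutional residual blocks; note that `resblock_kernel_sizes` is consumed as the per-block channel widths, while `kernel_size` is the actual convolution kernel size. A minimal shape check using the defaults from the diff (the residual postnet composition is only an illustration of how the pieces fit together):

```python
import paddle
from paddlespeech.t2s.modules.transformer.encoder import CNNDecoder, CNNPostnet

B, T, adim, odim = 2, 50, 256, 80
hs = paddle.randn([B, T, adim])        # encoder/variance-adaptor output
masks = paddle.ones([B, 1, T])         # (B, 1, T) source mask

decoder = CNNDecoder(emb_dim=adim, odim=odim, kernel_size=5,
                     dropout_rate=0.2, resblock_kernel_sizes=[256, 256])
postnet = CNNPostnet(odim=odim, kernel_size=5,
                     dropout_rate=0.2, resblock_kernel_sizes=[256, 256])

before, _ = decoder(hs, masks)         # (B, T, odim)
after = before + postnet(before.transpose([0, 2, 1]),
                         masks).transpose([0, 2, 1])   # (B, T, odim)
print(before.shape, after.shape)       # [2, 50, 80] [2, 50, 80]
```
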
+import argparse +import os +import time + +import paddle +from yacs.config import CfgNode + +from paddleaudio.backends import load as load_audio +from paddleaudio.compliance.librosa import melspectrogram +from paddlespeech.s2t.utils.log import Log +from paddlespeech.vector.io.batch import feature_normalize +from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn +from paddlespeech.vector.modules.sid_model import SpeakerIdetification +from paddlespeech.vector.training.seeding import seed_everything + +logger = Log(__name__).getlog() + + +def extract_audio_embedding(args, config): + # stage 0: set the device, cpu or gpu + paddle.set_device(args.device) + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + # stage 1: build the dnn backbone model network + ecapa_tdnn = EcapaTdnn(**config.model) + + # stage 2: build the speaker verification model with the backbone network + model = SpeakerIdetification( + backbone=ecapa_tdnn, num_class=config.num_speakers) + # stage 3: load the pre-trained model + args.load_checkpoint = os.path.abspath( + os.path.expanduser(args.load_checkpoint)) + + # load model checkpoint to sid model + state_dict = paddle.load( + os.path.join(args.load_checkpoint, 'model.pdparams')) + model.set_state_dict(state_dict) + logger.info(f'Checkpoint loaded from {args.load_checkpoint}') + + # stage 4: set the model to eval mode + model.eval() + + # stage 5: read the audio data and extract the embedding + # the waveform is a one-dimensional numpy array + waveform, sr = load_audio(args.audio_path) + + # the feat is a numpy array whose shape is [dim, time] + # we need to convert it to the one-batch shape [batch, dim, time], where batch is one + # so the final shape is [1, dim, time] + start_time = time.time() + feat = melspectrogram( + x=waveform, + sr=config.sr, + n_mels=config.n_mels, + window_size=config.window_size, + hop_length=config.hop_size) + feat = paddle.to_tensor(feat).unsqueeze(0) + + # at inference time the lengths are all ones because there is no padding + lengths = paddle.ones([1]) + feat = feature_normalize(feat, mean_norm=True, std_norm=False) + + # forward the feats through the backbone network to get the embedding + embedding = model.backbone( + feat, lengths).squeeze().numpy() # (1, emb_size, 1) -> (emb_size) + elapsed_time = time.time() - start_time + audio_length = waveform.shape[0] / sr + + # stage 6: compute the real-time factor (RTF) of the extraction + rtf = elapsed_time / audio_length + logger.info(f"{args.device} rtf={rtf}") + + return embedding + + +if __name__ == "__main__": + # yapf: disable + parser = argparse.ArgumentParser(__doc__) + parser.add_argument('--device', + choices=['cpu', 'gpu'], + default="cpu", + help="Select which device to run on, defaults to cpu.") + parser.add_argument("--config", + default=None, + type=str, + help="configuration file") + parser.add_argument("--load-checkpoint", + type=str, + default='', + help="Directory to load the model checkpoint from.") + parser.add_argument("--audio-path", + default="./data/demo.wav", + type=str, + help="Single audio file path") + args = parser.parse_args() + # yapf: enable + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + + config.freeze() + print(config) + + extract_audio_embedding(args, config) diff --git a/paddlespeech/vector/exps/ecapa_tdnn/test.py b/paddlespeech/vector/exps/ecapa_tdnn/test.py new file mode 100644 index 000000000..d0de6dc51 ---
/dev/null +++ b/paddlespeech/vector/exps/ecapa_tdnn/test.py @@ -0,0 +1,203 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import os + +import numpy as np +import paddle +from paddle.io import BatchSampler +from paddle.io import DataLoader +from tqdm import tqdm +from yacs.config import CfgNode + +from paddleaudio.datasets import VoxCeleb +from paddleaudio.metric import compute_eer +from paddlespeech.s2t.utils.log import Log +from paddlespeech.vector.io.batch import batch_feature_normalize +from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn +from paddlespeech.vector.modules.sid_model import SpeakerIdetification +from paddlespeech.vector.training.seeding import seed_everything + +logger = Log(__name__).getlog() + + +def main(args, config): + # stage0: set the training device, cpu or gpu + paddle.set_device(args.device) + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + # stage1: build the dnn backbone model network + ecapa_tdnn = EcapaTdnn(**config.model) + + # stage2: build the speaker verification eval instance with backbone model + model = SpeakerIdetification( + backbone=ecapa_tdnn, num_class=config.num_speakers) + + # stage3: load the pre-trained model + # we get the last model from the epoch and save_interval + args.load_checkpoint = os.path.abspath( + os.path.expanduser(args.load_checkpoint)) + + # load model checkpoint to sid model + state_dict = paddle.load( + os.path.join(args.load_checkpoint, 'model.pdparams')) + model.set_state_dict(state_dict) + logger.info(f'Checkpoint loaded from {args.load_checkpoint}') + + # stage4: construct the enroll and test dataloader + + enroll_dataset = VoxCeleb( + subset='enroll', + target_dir=args.data_dir, + feat_type='melspectrogram', + random_chunk=False, + n_mels=config.n_mels, + window_size=config.window_size, + hop_length=config.hop_size) + enroll_sampler = BatchSampler( + enroll_dataset, batch_size=config.batch_size, + shuffle=True) # Shuffle to make embedding normalization more robust. 
+ enrol_loader = DataLoader(enroll_dataset, + batch_sampler=enroll_sampler, + collate_fn=lambda x: batch_feature_normalize( + x, mean_norm=True, std_norm=False), + num_workers=config.num_workers, + return_list=True,) + test_dataset = VoxCeleb( + subset='test', + target_dir=args.data_dir, + feat_type='melspectrogram', + random_chunk=False, + n_mels=config.n_mels, + window_size=config.window_size, + hop_length=config.hop_size) + + test_sampler = BatchSampler( + test_dataset, batch_size=config.batch_size, shuffle=True) + test_loader = DataLoader(test_dataset, + batch_sampler=test_sampler, + collate_fn=lambda x: batch_feature_normalize( + x, mean_norm=True, std_norm=False), + num_workers=config.num_workers, + return_list=True,) + # stage5: we must set the model to eval mode + model.eval() + + # stage6: global embedding norm to imporve the performance + logger.info(f"global embedding norm: {config.global_embedding_norm}") + if config.global_embedding_norm: + global_embedding_mean = None + global_embedding_std = None + mean_norm_flag = config.embedding_mean_norm + std_norm_flag = config.embedding_std_norm + batch_count = 0 + + # stage7: Compute embeddings of audios in enrol and test dataset from model. + id2embedding = {} + # Run multi times to make embedding normalization more stable. + for i in range(2): + for dl in [enrol_loader, test_loader]: + logger.info( + f'Loop {[i+1]}: Computing embeddings on {dl.dataset.subset} dataset' + ) + with paddle.no_grad(): + for batch_idx, batch in enumerate(tqdm(dl)): + + # stage 8-1: extrac the audio embedding + ids, feats, lengths = batch['ids'], batch['feats'], batch[ + 'lengths'] + embeddings = model.backbone(feats, lengths).squeeze( + -1).numpy() # (N, emb_size, 1) -> (N, emb_size) + + # Global embedding normalization. + # if we use the global embedding norm + # eer can reduece about relative 10% + if config.global_embedding_norm: + batch_count += 1 + current_mean = embeddings.mean( + axis=0) if mean_norm_flag else 0 + current_std = embeddings.std( + axis=0) if std_norm_flag else 1 + # Update global mean and std. + if global_embedding_mean is None and global_embedding_std is None: + global_embedding_mean, global_embedding_std = current_mean, current_std + else: + weight = 1 / batch_count # Weight decay by batches. + global_embedding_mean = ( + 1 - weight + ) * global_embedding_mean + weight * current_mean + global_embedding_std = ( + 1 - weight + ) * global_embedding_std + weight * current_std + # Apply global embedding normalization. + embeddings = (embeddings - global_embedding_mean + ) / global_embedding_std + + # Update embedding dict. + id2embedding.update(dict(zip(ids, embeddings))) + + # stage 8: Compute cosine scores. 
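+ # each line of VoxCeleb.veri_test_file is "<label> <enroll_utt> <test_utt>", where label 1 marks a same-speaker (target) trial and 0 an impostor trial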
+ labels = [] + enroll_ids = [] + test_ids = [] + logger.info(f"read the trial from {VoxCeleb.veri_test_file}") + with open(VoxCeleb.veri_test_file, 'r') as f: + for line in f.readlines(): + label, enroll_id, test_id = line.strip().split(' ') + labels.append(int(label)) + enroll_ids.append(enroll_id.split('.')[0].replace('/', '-')) + test_ids.append(test_id.split('.')[0].replace('/', '-')) + + cos_sim_func = paddle.nn.CosineSimilarity(axis=1) + enrol_embeddings, test_embeddings = map(lambda ids: paddle.to_tensor( + np.asarray([id2embedding[uttid] for uttid in ids], dtype='float32')), + [enroll_ids, test_ids + ]) # (N, emb_size) + scores = cos_sim_func(enrol_embeddings, test_embeddings) + EER, threshold = compute_eer(np.asarray(labels), scores.numpy()) + logger.info( + f'EER of verification test: {EER*100:.4f}%, score threshold: {threshold:.5f}' + ) + + +if __name__ == "__main__": + # yapf: disable + parser = argparse.ArgumentParser(__doc__) + parser.add_argument('--device', + choices=['cpu', 'gpu'], + default="gpu", + help="Select which device to train model, defaults to gpu.") + parser.add_argument("--config", + default=None, + type=str, + help="configuration file") + parser.add_argument("--data-dir", + default="./data/", + type=str, + help="data directory") + parser.add_argument("--load-checkpoint", + type=str, + default='', + help="Directory to load model checkpoint to contiune trainning.") + args = parser.parse_args() + # yapf: enable + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + + config.freeze() + print(config) + main(args, config) diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py new file mode 100644 index 000000000..257b97abe --- /dev/null +++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py @@ -0,0 +1,351 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
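
For readers unfamiliar with the metric used in `test.py` above: each trial gets one cosine score between its enrolment and test embeddings, and the EER is the operating point where the false-acceptance and false-rejection rates meet. The script relies on `paddleaudio.metric.compute_eer`; the snippet below is only a small NumPy reference of the same computation, with hypothetical array names.

```python
import numpy as np

# `enroll_emb` / `test_emb` are assumed to be (N, emb_size) arrays, one row per
# trial, aligned with a 0/1 label per trial.
def cosine_scores(enroll_emb, test_emb):
    enroll_emb = enroll_emb / np.linalg.norm(enroll_emb, axis=1, keepdims=True)
    test_emb = test_emb / np.linalg.norm(test_emb, axis=1, keepdims=True)
    return np.sum(enroll_emb * test_emb, axis=1)  # one cosine score per trial


def naive_eer(labels, scores):
    # assumes both target (label 1) and impostor (label 0) trials are present
    target = scores[labels == 1]
    nontarget = scores[labels == 0]
    eer, best_gap, best_thr = 1.0, np.inf, None
    for thr in np.sort(scores):                 # sweep every score as a threshold
        far = float(np.mean(nontarget >= thr))  # false acceptance rate
        frr = float(np.mean(target < thr))      # false rejection rate
        if abs(far - frr) < best_gap:
            best_gap, best_thr, eer = abs(far - frr), thr, (far + frr) / 2
    return eer, best_thr
```
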
+import argparse +import os +import time + +import numpy as np +import paddle +from paddle.io import BatchSampler +from paddle.io import DataLoader +from paddle.io import DistributedBatchSampler +from yacs.config import CfgNode + +from paddleaudio.compliance.librosa import melspectrogram +from paddleaudio.datasets.voxceleb import VoxCeleb +from paddlespeech.s2t.utils.log import Log +from paddlespeech.vector.io.augment import build_augment_pipeline +from paddlespeech.vector.io.augment import waveform_augment +from paddlespeech.vector.io.batch import batch_pad_right +from paddlespeech.vector.io.batch import feature_normalize +from paddlespeech.vector.io.batch import waveform_collate_fn +from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn +from paddlespeech.vector.modules.loss import AdditiveAngularMargin +from paddlespeech.vector.modules.loss import LogSoftmaxWrapper +from paddlespeech.vector.modules.sid_model import SpeakerIdetification +from paddlespeech.vector.training.scheduler import CyclicLRScheduler +from paddlespeech.vector.training.seeding import seed_everything +from paddlespeech.vector.utils.time import Timer + +logger = Log(__name__).getlog() + + +def main(args, config): + # stage0: set the training device, cpu or gpu + paddle.set_device(args.device) + + # stage1: we must call the paddle.distributed.init_parallel_env() api at the begining + paddle.distributed.init_parallel_env() + nranks = paddle.distributed.get_world_size() + local_rank = paddle.distributed.get_rank() + # set the random seed, it is a must for multiprocess training + seed_everything(config.seed) + + # stage2: data prepare, such vox1 and vox2 data, and augment noise data and pipline + # note: some cmd must do in rank==0, so wo will refactor the data prepare code + train_dataset = VoxCeleb('train', target_dir=args.data_dir) + dev_dataset = VoxCeleb('dev', target_dir=args.data_dir) + + if config.augment: + augment_pipeline = build_augment_pipeline(target_dir=args.data_dir) + else: + augment_pipeline = [] + + # stage3: build the dnn backbone model network + ecapa_tdnn = EcapaTdnn(**config.model) + + # stage4: build the speaker verification train instance with backbone model + model = SpeakerIdetification( + backbone=ecapa_tdnn, num_class=VoxCeleb.num_speakers) + + # stage5: build the optimizer, we now only construct the AdamW optimizer + # 140000 is single gpu steps + # so, in multi-gpu mode, wo reduce the step_size to 140000//nranks to enable CyclicLRScheduler + lr_schedule = CyclicLRScheduler( + base_lr=config.learning_rate, max_lr=1e-3, step_size=140000 // nranks) + optimizer = paddle.optimizer.AdamW( + learning_rate=lr_schedule, parameters=model.parameters()) + + # stage6: build the loss function, we now only support LogSoftmaxWrapper + criterion = LogSoftmaxWrapper( + loss_fn=AdditiveAngularMargin(margin=0.2, scale=30)) + + # stage7: confirm training start epoch + # if pre-trained model exists, start epoch confirmed by the pre-trained model + start_epoch = 0 + if args.load_checkpoint: + logger.info("load the check point") + args.load_checkpoint = os.path.abspath( + os.path.expanduser(args.load_checkpoint)) + try: + # load model checkpoint + state_dict = paddle.load( + os.path.join(args.load_checkpoint, 'model.pdparams')) + model.set_state_dict(state_dict) + + # load optimizer checkpoint + state_dict = paddle.load( + os.path.join(args.load_checkpoint, 'model.pdopt')) + optimizer.set_state_dict(state_dict) + if local_rank == 0: + logger.info(f'Checkpoint loaded from {args.load_checkpoint}') + except 
FileExistsError: + if local_rank == 0: + logger.info('Train from scratch.') + + try: + start_epoch = int(args.load_checkpoint[-1]) + logger.info(f'Restore training from epoch {start_epoch}.') + except ValueError: + pass + + # stage8: we build the batch sampler for paddle.DataLoader + train_sampler = DistributedBatchSampler( + train_dataset, + batch_size=config.batch_size, + shuffle=True, + drop_last=False) + train_loader = DataLoader( + train_dataset, + batch_sampler=train_sampler, + num_workers=config.num_workers, + collate_fn=waveform_collate_fn, + return_list=True, + use_buffer_reader=True, ) + + # stage9: start to train + # we will comment the training process + steps_per_epoch = len(train_sampler) + timer = Timer(steps_per_epoch * config.epochs) + last_saved_epoch = "" + timer.start() + + for epoch in range(start_epoch + 1, config.epochs + 1): + # at the begining, model must set to train mode + model.train() + + avg_loss = 0 + num_corrects = 0 + num_samples = 0 + train_reader_cost = 0.0 + train_feat_cost = 0.0 + train_run_cost = 0.0 + + reader_start = time.time() + for batch_idx, batch in enumerate(train_loader): + train_reader_cost += time.time() - reader_start + + # stage 9-1: batch data is audio sample points and speaker id label + feat_start = time.time() + waveforms, labels = batch['waveforms'], batch['labels'] + waveforms, lengths = batch_pad_right(waveforms.numpy()) + waveforms = paddle.to_tensor(waveforms) + + # stage 9-2: audio sample augment method, which is done on the audio sample point + # the original wavefrom and the augmented waveform is concatented in a batch + # eg. five augment method in the augment pipeline + # the final data nums is batch_size * [five + one] + # -> five augmented waveform batch plus one original batch waveform + if len(augment_pipeline) != 0: + waveforms = waveform_augment(waveforms, augment_pipeline) + labels = paddle.concat( + [labels for i in range(len(augment_pipeline) + 1)]) + + # stage 9-3: extract the audio feats,such fbank, mfcc, spectrogram + feats = [] + for waveform in waveforms.numpy(): + feat = melspectrogram( + x=waveform, + sr=config.sr, + n_mels=config.n_mels, + window_size=config.window_size, + hop_length=config.hop_size) + feats.append(feat) + feats = paddle.to_tensor(np.asarray(feats)) + + # stage 9-4: feature normalize, which help converge and imporve the performance + feats = feature_normalize( + feats, mean_norm=True, std_norm=False) # Features normalization + train_feat_cost += time.time() - feat_start + + # stage 9-5: model forward, such ecapa-tdnn, x-vector + train_start = time.time() + logits = model(feats) + + # stage 9-6: loss function criterion, such AngularMargin, AdditiveAngularMargin + loss = criterion(logits, labels) + + # stage 9-7: update the gradient and clear the gradient cache + loss.backward() + optimizer.step() + if isinstance(optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + optimizer._learning_rate.step() + optimizer.clear_grad() + train_run_cost += time.time() - train_start + + # stage 9-8: Calculate average loss per batch + avg_loss += loss.numpy()[0] + + # stage 9-9: Calculate metrics, which is one-best accuracy + preds = paddle.argmax(logits, axis=1) + num_corrects += (preds == labels).numpy().sum() + num_samples += feats.shape[0] + timer.count() # step plus one in timer + + # stage 9-10: print the log information only on 0-rank per log-freq batchs + if (batch_idx + 1) % config.log_interval == 0 and local_rank == 0: + lr = optimizer.get_lr() + avg_loss /= config.log_interval + avg_acc = 
num_corrects / num_samples + + print_msg = 'Train Epoch={}/{}, Step={}/{}'.format( + epoch, config.epochs, batch_idx + 1, steps_per_epoch) + print_msg += ' loss={:.4f}'.format(avg_loss) + print_msg += ' acc={:.4f}'.format(avg_acc) + print_msg += ' avg_reader_cost: {:.5f} sec,'.format( + train_reader_cost / config.log_interval) + print_msg += ' avg_feat_cost: {:.5f} sec,'.format( + train_feat_cost / config.log_interval) + print_msg += ' avg_train_cost: {:.5f} sec,'.format( + train_run_cost / config.log_interval) + print_msg += ' lr={:.4E} step/sec={:.2f} | ETA {}'.format( + lr, timer.timing, timer.eta) + logger.info(print_msg) + + avg_loss = 0 + num_corrects = 0 + num_samples = 0 + train_reader_cost = 0.0 + train_feat_cost = 0.0 + train_run_cost = 0.0 + + reader_start = time.time() + + # stage 9-11: save the model parameters only on 0-rank per save-freq batchs + if epoch % config.save_interval == 0 and batch_idx + 1 == steps_per_epoch: + if local_rank != 0: + paddle.distributed.barrier( + ) # Wait for valid step in main process + continue # Resume trainning on other process + + # stage 9-12: construct the valid dataset dataloader + dev_sampler = BatchSampler( + dev_dataset, + batch_size=config.batch_size, + shuffle=False, + drop_last=False) + dev_loader = DataLoader( + dev_dataset, + batch_sampler=dev_sampler, + collate_fn=waveform_collate_fn, + num_workers=config.num_workers, + return_list=True, ) + + # set the model to eval mode + model.eval() + num_corrects = 0 + num_samples = 0 + + # stage 9-13: evaluation the valid dataset batch data + logger.info('Evaluate on validation dataset') + with paddle.no_grad(): + for batch_idx, batch in enumerate(dev_loader): + waveforms, labels = batch['waveforms'], batch['labels'] + + feats = [] + for waveform in waveforms.numpy(): + feat = melspectrogram( + x=waveform, + sr=config.sr, + n_mels=config.n_mels, + window_size=config.window_size, + hop_length=config.hop_size) + feats.append(feat) + + feats = paddle.to_tensor(np.asarray(feats)) + feats = feature_normalize( + feats, mean_norm=True, std_norm=False) + logits = model(feats) + + preds = paddle.argmax(logits, axis=1) + num_corrects += (preds == labels).numpy().sum() + num_samples += feats.shape[0] + + print_msg = '[Evaluation result]' + print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples) + logger.info(print_msg) + + # stage 9-14: Save model parameters + save_dir = os.path.join(args.checkpoint_dir, + 'epoch_{}'.format(epoch)) + last_saved_epoch = os.path.join('epoch_{}'.format(epoch), + "model.pdparams") + logger.info('Saving model checkpoint to {}'.format(save_dir)) + paddle.save(model.state_dict(), + os.path.join(save_dir, 'model.pdparams')) + paddle.save(optimizer.state_dict(), + os.path.join(save_dir, 'model.pdopt')) + + if nranks > 1: + paddle.distributed.barrier() # Main process + + # stage 10: create the final trained model.pdparams with soft link + if local_rank == 0: + final_model = os.path.join(args.checkpoint_dir, "model.pdparams") + logger.info(f"we will create the final model: {final_model}") + if os.path.islink(final_model): + logger.info( + f"An {final_model} already exists, we will rm is and create it again" + ) + os.unlink(final_model) + os.symlink(last_saved_epoch, final_model) + + +if __name__ == "__main__": + # yapf: disable + parser = argparse.ArgumentParser(__doc__) + parser.add_argument('--device', + choices=['cpu', 'gpu'], + default="cpu", + help="Select which device to train model, defaults to gpu.") + parser.add_argument("--config", + default=None, + type=str, + 
help="configuration file") + parser.add_argument("--data-dir", + default="./data/", + type=str, + help="data directory") + parser.add_argument("--load-checkpoint", + type=str, + default=None, + help="Directory to load model checkpoint to contiune trainning.") + parser.add_argument("--checkpoint-dir", + type=str, + default='./checkpoint', + help="Directory to save model checkpoints.") + + args = parser.parse_args() + # yapf: enable + + # https://yaml.org/type/float.html + config = CfgNode(new_allowed=True) + if args.config: + config.merge_from_file(args.config) + + config.freeze() + print(config) + + main(args, config) diff --git a/paddlespeech/vector/io/__init__.py b/paddlespeech/vector/io/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddlespeech/vector/io/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/vector/io/augment.py b/paddlespeech/vector/io/augment.py new file mode 100644 index 000000000..3baace139 --- /dev/null +++ b/paddlespeech/vector/io/augment.py @@ -0,0 +1,906 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# this is modified from SpeechBrain +# https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/lobes/augment.py +import math +from typing import List + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddleaudio.datasets.rirs_noises import OpenRIRNoise +from paddlespeech.s2t.utils.log import Log +from paddlespeech.vector.io.signal_processing import compute_amplitude +from paddlespeech.vector.io.signal_processing import convolve1d +from paddlespeech.vector.io.signal_processing import dB_to_amplitude +from paddlespeech.vector.io.signal_processing import notch_filter +from paddlespeech.vector.io.signal_processing import reverberate + +logger = Log(__name__).getlog() + + +# TODO: Complete type-hint and doc string. 
+class DropFreq(nn.Layer): + def __init__( + self, + drop_freq_low=1e-14, + drop_freq_high=1, + drop_count_low=1, + drop_count_high=2, + drop_width=0.05, + drop_prob=1, ): + super(DropFreq, self).__init__() + self.drop_freq_low = drop_freq_low + self.drop_freq_high = drop_freq_high + self.drop_count_low = drop_count_low + self.drop_count_high = drop_count_high + self.drop_width = drop_width + self.drop_prob = drop_prob + + def forward(self, waveforms): + # Don't drop (return early) 1-`drop_prob` portion of the batches + dropped_waveform = waveforms.clone() + if paddle.rand([1]) > self.drop_prob: + return dropped_waveform + + # Add channels dimension + if len(waveforms.shape) == 2: + dropped_waveform = dropped_waveform.unsqueeze(-1) + + # Pick number of frequencies to drop + drop_count = paddle.randint( + low=self.drop_count_low, high=self.drop_count_high + 1, shape=[1]) + + # Pick a frequency to drop + drop_range = self.drop_freq_high - self.drop_freq_low + drop_frequency = ( + paddle.rand([drop_count]) * drop_range + self.drop_freq_low) + + # Filter parameters + filter_length = 101 + pad = filter_length // 2 + + # Start with delta function + drop_filter = paddle.zeros([1, filter_length, 1]) + drop_filter[0, pad, 0] = 1 + + # Subtract each frequency + for frequency in drop_frequency: + notch_kernel = notch_filter(frequency, filter_length, + self.drop_width) + drop_filter = convolve1d(drop_filter, notch_kernel, pad) + + # Apply filter + dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad) + + # Remove channels dimension if added + return dropped_waveform.squeeze(-1) + + +class DropChunk(nn.Layer): + def __init__( + self, + drop_length_low=100, + drop_length_high=1000, + drop_count_low=1, + drop_count_high=10, + drop_start=0, + drop_end=None, + drop_prob=1, + noise_factor=0.0, ): + super(DropChunk, self).__init__() + self.drop_length_low = drop_length_low + self.drop_length_high = drop_length_high + self.drop_count_low = drop_count_low + self.drop_count_high = drop_count_high + self.drop_start = drop_start + self.drop_end = drop_end + self.drop_prob = drop_prob + self.noise_factor = noise_factor + + # Validate low < high + if drop_length_low > drop_length_high: + raise ValueError("Low limit must not be more than high limit") + if drop_count_low > drop_count_high: + raise ValueError("Low limit must not be more than high limit") + + # Make sure the length doesn't exceed end - start + if drop_end is not None and drop_end >= 0: + if drop_start > drop_end: + raise ValueError("Low limit must not be more than high limit") + + drop_range = drop_end - drop_start + self.drop_length_low = min(drop_length_low, drop_range) + self.drop_length_high = min(drop_length_high, drop_range) + + def forward(self, waveforms, lengths): + # Reading input list + lengths = (lengths * waveforms.shape[1]).astype('int64') + batch_size = waveforms.shape[0] + dropped_waveform = waveforms.clone() + + # Don't drop (return early) 1-`drop_prob` portion of the batches + if paddle.rand([1]) > self.drop_prob: + return dropped_waveform + + # Store original amplitude for computing white noise amplitude + clean_amplitude = compute_amplitude(waveforms, lengths.unsqueeze(1)) + + # Pick a number of times to drop + drop_times = paddle.randint( + low=self.drop_count_low, + high=self.drop_count_high + 1, + shape=[batch_size], ) + + # Iterate batch to set mask + for i in range(batch_size): + if drop_times[i] == 0: + continue + + # Pick lengths + length = paddle.randint( + low=self.drop_length_low, + high=self.drop_length_high + 
1, + shape=[drop_times[i]], ) + + # Compute range of starting locations + start_min = self.drop_start + if start_min < 0: + start_min += lengths[i] + start_max = self.drop_end + if start_max is None: + start_max = lengths[i] + if start_max < 0: + start_max += lengths[i] + start_max = max(0, start_max - length.max()) + + # Pick starting locations + start = paddle.randint( + low=start_min, + high=start_max + 1, + shape=[drop_times[i]], ) + + end = start + length + + # Update waveform + if not self.noise_factor: + for j in range(drop_times[i]): + if start[j] < end[j]: + dropped_waveform[i, start[j]:end[j]] = 0.0 + else: + # Uniform distribution of -2 to +2 * avg amplitude should + # preserve the average for normalization + noise_max = 2 * clean_amplitude[i] * self.noise_factor + for j in range(drop_times[i]): + # zero-center the noise distribution + noise_vec = paddle.rand([length[j]], dtype='float32') + + noise_vec = 2 * noise_max * noise_vec - noise_max + dropped_waveform[i, int(start[j]):int(end[j])] = noise_vec + + return dropped_waveform + + +class Resample(nn.Layer): + def __init__( + self, + orig_freq=16000, + new_freq=16000, + lowpass_filter_width=6, ): + super(Resample, self).__init__() + self.orig_freq = orig_freq + self.new_freq = new_freq + self.lowpass_filter_width = lowpass_filter_width + + # Compute rate for striding + self._compute_strides() + assert self.orig_freq % self.conv_stride == 0 + assert self.new_freq % self.conv_transpose_stride == 0 + + def _compute_strides(self): + # Compute new unit based on ratio of in/out frequencies + base_freq = math.gcd(self.orig_freq, self.new_freq) + input_samples_in_unit = self.orig_freq // base_freq + self.output_samples = self.new_freq // base_freq + + # Store the appropriate stride based on the new units + self.conv_stride = input_samples_in_unit + self.conv_transpose_stride = self.output_samples + + def forward(self, waveforms): + if not hasattr(self, "first_indices"): + self._indices_and_weights(waveforms) + + # Don't do anything if the frequencies are the same + if self.orig_freq == self.new_freq: + return waveforms + + unsqueezed = False + if len(waveforms.shape) == 2: + waveforms = waveforms.unsqueeze(1) + unsqueezed = True + elif len(waveforms.shape) == 3: + waveforms = waveforms.transpose([0, 2, 1]) + else: + raise ValueError("Input must be 2 or 3 dimensions") + + # Do resampling + resampled_waveform = self._perform_resample(waveforms) + + if unsqueezed: + resampled_waveform = resampled_waveform.squeeze(1) + else: + resampled_waveform = resampled_waveform.transpose([0, 2, 1]) + + return resampled_waveform + + def _perform_resample(self, waveforms): + # Compute output size and initialize + batch_size, num_channels, wave_len = waveforms.shape + window_size = self.weights.shape[1] + tot_output_samp = self._output_samples(wave_len) + resampled_waveform = paddle.zeros((batch_size, num_channels, + tot_output_samp)) + + # eye size: (num_channels, num_channels, 1) + eye = paddle.eye(num_channels).unsqueeze(2) + + # Iterate over the phases in the polyphase filter + for i in range(self.first_indices.shape[0]): + wave_to_conv = waveforms + first_index = int(self.first_indices[i].item()) + if first_index >= 0: + # trim the signal as the filter will not be applied + # before the first_index + wave_to_conv = wave_to_conv[:, :, first_index:] + + # pad the right of the signal to allow partial convolutions + # meaning compute values for partial windows (e.g. 
end of the + # window is outside the signal length) + max_index = (tot_output_samp - 1) // self.output_samples + end_index = max_index * self.conv_stride + window_size + current_wave_len = wave_len - first_index + right_padding = max(0, end_index + 1 - current_wave_len) + left_padding = max(0, -first_index) + wave_to_conv = paddle.nn.functional.pad( + wave_to_conv, [left_padding, right_padding], data_format='NCL') + conv_wave = paddle.nn.functional.conv1d( + x=wave_to_conv, + # weight=self.weights[i].repeat(num_channels, 1, 1), + weight=self.weights[i].expand((num_channels, 1, -1)), + stride=self.conv_stride, + groups=num_channels, ) + + # we want conv_wave[:, i] to be at + # output[:, i + n*conv_transpose_stride] + dilated_conv_wave = paddle.nn.functional.conv1d_transpose( + conv_wave, eye, stride=self.conv_transpose_stride) + + # pad dilated_conv_wave so it reaches the output length if needed. + left_padding = i + previous_padding = left_padding + dilated_conv_wave.shape[-1] + right_padding = max(0, tot_output_samp - previous_padding) + dilated_conv_wave = paddle.nn.functional.pad( + dilated_conv_wave, [left_padding, right_padding], + data_format='NCL') + dilated_conv_wave = dilated_conv_wave[:, :, :tot_output_samp] + + resampled_waveform += dilated_conv_wave + + return resampled_waveform + + def _output_samples(self, input_num_samp): + samp_in = int(self.orig_freq) + samp_out = int(self.new_freq) + + tick_freq = abs(samp_in * samp_out) // math.gcd(samp_in, samp_out) + ticks_per_input_period = tick_freq // samp_in + + # work out the number of ticks in the time interval + # [ 0, input_num_samp/samp_in ). + interval_length = input_num_samp * ticks_per_input_period + if interval_length <= 0: + return 0 + ticks_per_output_period = tick_freq // samp_out + + # Get the last output-sample in the closed interval, + # i.e. replacing [ ) with [ ]. Note: integer division rounds down. + # See http://en.wikipedia.org/wiki/Interval_(mathematics) for an + # explanation of the notation. + last_output_samp = interval_length // ticks_per_output_period + + # We need the last output-sample in the open interval, so if it + # takes us to the end of the interval exactly, subtract one. + if last_output_samp * ticks_per_output_period == interval_length: + last_output_samp -= 1 + + # First output-sample index is zero, so the number of output samples + # is the last output-sample plus one. 
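+ # Worked example: for 16 kHz -> 8 kHz, tick_freq = 16000, ticks_per_input_period = 1 + # and ticks_per_output_period = 2, so 16000 input samples give interval_length = 16000, + # last_output_samp = 7999 (8000 lands exactly on the interval end, hence the subtraction), + # and num_output_samp = 8000, as expected when halving the sample rate.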
+ num_output_samp = last_output_samp + 1 + + return num_output_samp + + def _indices_and_weights(self, waveforms): + # Lowpass filter frequency depends on smaller of two frequencies + min_freq = min(self.orig_freq, self.new_freq) + lowpass_cutoff = 0.99 * 0.5 * min_freq + + assert lowpass_cutoff * 2 <= min_freq + window_width = self.lowpass_filter_width / (2.0 * lowpass_cutoff) + + assert lowpass_cutoff < min(self.orig_freq, self.new_freq) / 2 + output_t = paddle.arange(start=0.0, end=self.output_samples) + output_t /= self.new_freq + min_t = output_t - window_width + max_t = output_t + window_width + + min_input_index = paddle.ceil(min_t * self.orig_freq) + max_input_index = paddle.floor(max_t * self.orig_freq) + num_indices = max_input_index - min_input_index + 1 + + max_weight_width = num_indices.max() + j = paddle.arange(max_weight_width, dtype='float32') + input_index = min_input_index.unsqueeze(1) + j.unsqueeze(0) + delta_t = (input_index / self.orig_freq) - output_t.unsqueeze(1) + + weights = paddle.zeros_like(delta_t) + inside_window_indices = delta_t.abs().less_than( + paddle.to_tensor(window_width)) + + # raised-cosine (Hanning) window with width `window_width` + weights[inside_window_indices] = 0.5 * (1 + paddle.cos( + 2 * math.pi * lowpass_cutoff / self.lowpass_filter_width * + delta_t.masked_select(inside_window_indices))) + + t_eq_zero_indices = delta_t.equal(paddle.zeros_like(delta_t)) + t_not_eq_zero_indices = delta_t.not_equal(paddle.zeros_like(delta_t)) + + # sinc filter function + weights = paddle.where( + t_not_eq_zero_indices, + weights * paddle.sin(2 * math.pi * lowpass_cutoff * delta_t) / + (math.pi * delta_t), weights) + + # limit of the function at t = 0 + weights = paddle.where(t_eq_zero_indices, weights * 2 * lowpass_cutoff, + weights) + + # size (output_samples, max_weight_width) + weights /= self.orig_freq + + self.first_indices = min_input_index + self.weights = weights + + +class SpeedPerturb(nn.Layer): + def __init__( + self, + orig_freq, + speeds=[90, 100, 110], + perturb_prob=1.0, ): + super(SpeedPerturb, self).__init__() + self.orig_freq = orig_freq + self.speeds = speeds + self.perturb_prob = perturb_prob + + # Initialize index of perturbation + self.samp_index = 0 + + # Initialize resamplers + self.resamplers = [] + for speed in self.speeds: + config = { + "orig_freq": self.orig_freq, + "new_freq": self.orig_freq * speed // 100, + } + self.resamplers.append(Resample(**config)) + + def forward(self, waveform): + # Don't perturb (return early) 1-`perturb_prob` portion of the batches + if paddle.rand([1]) > self.perturb_prob: + return waveform.clone() + + # Perform a random perturbation + self.samp_index = paddle.randint(len(self.speeds), shape=[1]).item() + perturbed_waveform = self.resamplers[self.samp_index](waveform) + + return perturbed_waveform + + +class AddNoise(nn.Layer): + def __init__( + self, + noise_dataset=None, # None for white noise + num_workers=0, + snr_low=0, + snr_high=0, + mix_prob=1.0, + start_index=None, + normalize=False, ): + super(AddNoise, self).__init__() + + self.num_workers = num_workers + self.snr_low = snr_low + self.snr_high = snr_high + self.mix_prob = mix_prob + self.start_index = start_index + self.normalize = normalize + self.noise_dataset = noise_dataset + self.noise_dataloader = None + + def forward(self, waveforms, lengths=None): + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + # Copy clean waveform to initialize noisy waveform + noisy_waveform = waveforms.clone() + lengths = (lengths * 
waveforms.shape[1]).astype('int64').unsqueeze(1) + + # Don't add noise (return early) 1-`mix_prob` portion of the batches + if paddle.rand([1]) > self.mix_prob: + return noisy_waveform + + # Compute the average amplitude of the clean waveforms + clean_amplitude = compute_amplitude(waveforms, lengths) + + # Pick an SNR and use it to compute the mixture amplitude factors + SNR = paddle.rand((len(waveforms), 1)) + SNR = SNR * (self.snr_high - self.snr_low) + self.snr_low + noise_amplitude_factor = 1 / (dB_to_amplitude(SNR) + 1) + new_noise_amplitude = noise_amplitude_factor * clean_amplitude + + # Scale clean signal appropriately + noisy_waveform *= 1 - noise_amplitude_factor + + # Loop through clean samples and create mixture + if self.noise_dataset is None: + white_noise = paddle.normal(shape=waveforms.shape) + noisy_waveform += new_noise_amplitude * white_noise + else: + tensor_length = waveforms.shape[1] + noise_waveform, noise_length = self._load_noise( + lengths, + tensor_length, ) + + # Rescale and add + noise_amplitude = compute_amplitude(noise_waveform, noise_length) + noise_waveform *= new_noise_amplitude / (noise_amplitude + 1e-14) + noisy_waveform += noise_waveform + + # Normalizing to prevent clipping + if self.normalize: + abs_max, _ = paddle.max( + paddle.abs(noisy_waveform), axis=1, keepdim=True) + noisy_waveform = noisy_waveform / abs_max.clip(min=1.0) + + return noisy_waveform + + def _load_noise(self, lengths, max_length): + """ + Load a batch of noises + + args + lengths(Paddle.Tensor): Num samples of waveforms with shape (N, 1). + max_length(int): Width of a batch. + """ + lengths = lengths.squeeze(1) + batch_size = len(lengths) + + # Load a noise batch + if self.noise_dataloader is None: + + def noise_collate_fn(batch): + def pad(x, target_length, mode='constant', **kwargs): + x = np.asarray(x) + w = target_length - x.shape[0] + assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[0]}' + return np.pad(x, [0, w], mode=mode, **kwargs) + + ids = [item['id'] for item in batch] + lengths = np.asarray([item['feat'].shape[0] for item in batch]) + waveforms = list( + map(lambda x: pad(x, max(max_length, lengths.max().item())), + [item['feat'] for item in batch])) + waveforms = np.stack(waveforms) + return {'ids': ids, 'feats': waveforms, 'lengths': lengths} + + # Create noise data loader. 
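+ # The loader is built lazily on the first forward call so its batch size can match the incoming waveform batch, and it is reused afterwards.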
+ self.noise_dataloader = paddle.io.DataLoader( + self.noise_dataset, + batch_size=batch_size, + shuffle=True, + num_workers=self.num_workers, + collate_fn=noise_collate_fn, + return_list=True, ) + self.noise_data = iter(self.noise_dataloader) + + noise_batch, noise_len = self._load_noise_batch_of_size(batch_size) + + # Select a random starting location in the waveform + start_index = self.start_index + if self.start_index is None: + start_index = 0 + max_chop = (noise_len - lengths).min().clip(min=1) + start_index = paddle.randint(high=max_chop, shape=[1]) + + # Truncate noise_batch to max_length + noise_batch = noise_batch[:, start_index:start_index + max_length] + noise_len = (noise_len - start_index).clip(max=max_length).unsqueeze(1) + return noise_batch, noise_len + + def _load_noise_batch_of_size(self, batch_size): + """Concatenate noise batches, then chop to correct size""" + noise_batch, noise_lens = self._load_noise_batch() + + # Expand + while len(noise_batch) < batch_size: + noise_batch = paddle.concat((noise_batch, noise_batch)) + noise_lens = paddle.concat((noise_lens, noise_lens)) + + # Contract + if len(noise_batch) > batch_size: + noise_batch = noise_batch[:batch_size] + noise_lens = noise_lens[:batch_size] + + return noise_batch, noise_lens + + def _load_noise_batch(self): + """Load a batch of noises, restarting iteration if necessary.""" + try: + batch = next(self.noise_data) + except StopIteration: + self.noise_data = iter(self.noise_dataloader) + batch = next(self.noise_data) + + noises, lens = batch['feats'], batch['lengths'] + return noises, lens + + +class AddReverb(nn.Layer): + def __init__( + self, + rir_dataset, + reverb_prob=1.0, + rir_scale_factor=1.0, + num_workers=0, ): + super(AddReverb, self).__init__() + self.rir_dataset = rir_dataset + self.reverb_prob = reverb_prob + self.rir_scale_factor = rir_scale_factor + + # Create rir data loader. + def rir_collate_fn(batch): + def pad(x, target_length, mode='constant', **kwargs): + x = np.asarray(x) + w = target_length - x.shape[0] + assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[0]}' + return np.pad(x, [0, w], mode=mode, **kwargs) + + ids = [item['id'] for item in batch] + lengths = np.asarray([item['feat'].shape[0] for item in batch]) + waveforms = list( + map(lambda x: pad(x, lengths.max().item()), + [item['feat'] for item in batch])) + waveforms = np.stack(waveforms) + return {'ids': ids, 'feats': waveforms, 'lengths': lengths} + + self.rir_dataloader = paddle.io.DataLoader( + self.rir_dataset, + collate_fn=rir_collate_fn, + num_workers=num_workers, + shuffle=True, + return_list=True, ) + + self.rir_data = iter(self.rir_dataloader) + + def forward(self, waveforms, lengths=None): + """ + Arguments + --------- + waveforms : tensor + Shape should be `[batch, time]` or `[batch, time, channels]`. + lengths : tensor + Shape should be a single dimension, `[batch]`. + + Returns + ------- + Tensor of shape `[batch, time]` or `[batch, time, channels]`. 
+ """ + + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + # Don't add reverb (return early) 1-`reverb_prob` portion of the time + if paddle.rand([1]) > self.reverb_prob: + return waveforms.clone() + + # Add channels dimension if necessary + channel_added = False + if len(waveforms.shape) == 2: + waveforms = waveforms.unsqueeze(-1) + channel_added = True + + # Load and prepare RIR + rir_waveform = self._load_rir() + + # Compress or dilate RIR + if self.rir_scale_factor != 1: + rir_waveform = F.interpolate( + rir_waveform.transpose([0, 2, 1]), + scale_factor=self.rir_scale_factor, + mode="linear", + align_corners=False, + data_format='NCW', ) + # (N, C, L) -> (N, L, C) + rir_waveform = rir_waveform.transpose([0, 2, 1]) + + rev_waveform = reverberate( + waveforms, + rir_waveform, + self.rir_dataset.sample_rate, + rescale_amp="avg") + + # Remove channels dimension if added + if channel_added: + return rev_waveform.squeeze(-1) + + return rev_waveform + + def _load_rir(self): + try: + batch = next(self.rir_data) + except StopIteration: + self.rir_data = iter(self.rir_dataloader) + batch = next(self.rir_data) + + rir_waveform = batch['feats'] + + # Make sure RIR has correct channels + if len(rir_waveform.shape) == 2: + rir_waveform = rir_waveform.unsqueeze(-1) + + return rir_waveform + + +class AddBabble(nn.Layer): + def __init__( + self, + speaker_count=3, + snr_low=0, + snr_high=0, + mix_prob=1, ): + super(AddBabble, self).__init__() + self.speaker_count = speaker_count + self.snr_low = snr_low + self.snr_high = snr_high + self.mix_prob = mix_prob + + def forward(self, waveforms, lengths=None): + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + babbled_waveform = waveforms.clone() + lengths = (lengths * waveforms.shape[1]).unsqueeze(1) + batch_size = len(waveforms) + + # Don't mix (return early) 1-`mix_prob` portion of the batches + if paddle.rand([1]) > self.mix_prob: + return babbled_waveform + + # Pick an SNR and use it to compute the mixture amplitude factors + clean_amplitude = compute_amplitude(waveforms, lengths) + SNR = paddle.rand((batch_size, 1)) + SNR = SNR * (self.snr_high - self.snr_low) + self.snr_low + noise_amplitude_factor = 1 / (dB_to_amplitude(SNR) + 1) + new_noise_amplitude = noise_amplitude_factor * clean_amplitude + + # Scale clean signal appropriately + babbled_waveform *= 1 - noise_amplitude_factor + + # For each speaker in the mixture, roll and add + babble_waveform = waveforms.roll((1, ), axis=0) + babble_len = lengths.roll((1, ), axis=0) + for i in range(1, self.speaker_count): + babble_waveform += waveforms.roll((1 + i, ), axis=0) + babble_len = paddle.concat( + [babble_len, babble_len.roll((1, ), axis=0)], axis=-1).max( + axis=-1, keepdim=True) + + # Rescale and add to mixture + babble_amplitude = compute_amplitude(babble_waveform, babble_len) + babble_waveform *= new_noise_amplitude / (babble_amplitude + 1e-14) + babbled_waveform += babble_waveform + + return babbled_waveform + + +class TimeDomainSpecAugment(nn.Layer): + def __init__( + self, + perturb_prob=1.0, + drop_freq_prob=1.0, + drop_chunk_prob=1.0, + speeds=[95, 100, 105], + sample_rate=16000, + drop_freq_count_low=0, + drop_freq_count_high=3, + drop_chunk_count_low=0, + drop_chunk_count_high=5, + drop_chunk_length_low=1000, + drop_chunk_length_high=2000, + drop_chunk_noise_factor=0, ): + super(TimeDomainSpecAugment, self).__init__() + self.speed_perturb = SpeedPerturb( + perturb_prob=perturb_prob, + orig_freq=sample_rate, + speeds=speeds, ) + self.drop_freq = DropFreq( + 
drop_prob=drop_freq_prob, + drop_count_low=drop_freq_count_low, + drop_count_high=drop_freq_count_high, ) + self.drop_chunk = DropChunk( + drop_prob=drop_chunk_prob, + drop_count_low=drop_chunk_count_low, + drop_count_high=drop_chunk_count_high, + drop_length_low=drop_chunk_length_low, + drop_length_high=drop_chunk_length_high, + noise_factor=drop_chunk_noise_factor, ) + + def forward(self, waveforms, lengths=None): + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + with paddle.no_grad(): + # Augmentation + waveforms = self.speed_perturb(waveforms) + waveforms = self.drop_freq(waveforms) + waveforms = self.drop_chunk(waveforms, lengths) + + return waveforms + + +class EnvCorrupt(nn.Layer): + def __init__( + self, + reverb_prob=1.0, + babble_prob=1.0, + noise_prob=1.0, + rir_dataset=None, + noise_dataset=None, + num_workers=0, + babble_speaker_count=0, + babble_snr_low=0, + babble_snr_high=0, + noise_snr_low=0, + noise_snr_high=0, + rir_scale_factor=1.0, ): + super(EnvCorrupt, self).__init__() + + # Initialize corrupters + if rir_dataset is not None and reverb_prob > 0.0: + self.add_reverb = AddReverb( + rir_dataset=rir_dataset, + num_workers=num_workers, + reverb_prob=reverb_prob, + rir_scale_factor=rir_scale_factor, ) + + if babble_speaker_count > 0 and babble_prob > 0.0: + self.add_babble = AddBabble( + speaker_count=babble_speaker_count, + snr_low=babble_snr_low, + snr_high=babble_snr_high, + mix_prob=babble_prob, ) + + if noise_dataset is not None and noise_prob > 0.0: + self.add_noise = AddNoise( + noise_dataset=noise_dataset, + num_workers=num_workers, + snr_low=noise_snr_low, + snr_high=noise_snr_high, + mix_prob=noise_prob, ) + + def forward(self, waveforms, lengths=None): + if lengths is None: + lengths = paddle.ones([len(waveforms)]) + + # Augmentation + with paddle.no_grad(): + if hasattr(self, "add_reverb"): + try: + waveforms = self.add_reverb(waveforms, lengths) + except Exception: + pass + if hasattr(self, "add_babble"): + waveforms = self.add_babble(waveforms, lengths) + if hasattr(self, "add_noise"): + waveforms = self.add_noise(waveforms, lengths) + + return waveforms + + +def build_augment_pipeline(target_dir=None) -> List[paddle.nn.Layer]: + """build augment pipeline + Note: this pipeline cannot be used in the paddle.DataLoader + + Returns: + List[paddle.nn.Layer]: all augment process + """ + logger.info("start to build the augment pipeline") + noise_dataset = OpenRIRNoise('noise', target_dir=target_dir) + rir_dataset = OpenRIRNoise('rir', target_dir=target_dir) + + wavedrop = TimeDomainSpecAugment( + sample_rate=16000, + speeds=[100], ) + speed_perturb = TimeDomainSpecAugment( + sample_rate=16000, + speeds=[95, 100, 105], ) + add_noise = EnvCorrupt( + noise_dataset=noise_dataset, + reverb_prob=0.0, + noise_prob=1.0, + noise_snr_low=0, + noise_snr_high=15, + rir_scale_factor=1.0, ) + add_rev = EnvCorrupt( + rir_dataset=rir_dataset, + reverb_prob=1.0, + noise_prob=0.0, + rir_scale_factor=1.0, ) + add_rev_noise = EnvCorrupt( + noise_dataset=noise_dataset, + rir_dataset=rir_dataset, + reverb_prob=1.0, + noise_prob=1.0, + noise_snr_low=0, + noise_snr_high=15, + rir_scale_factor=1.0, ) + + return [wavedrop, speed_perturb, add_noise, add_rev, add_rev_noise] + + +def waveform_augment(waveforms: paddle.Tensor, + augment_pipeline: List[paddle.nn.Layer]) -> paddle.Tensor: + """process the augment pipeline and return all the waveforms + + Args: + waveforms (paddle.Tensor): original batch waveform + augment_pipeline (List[paddle.nn.Layer]): agument pipeline process 
+ + Returns: + paddle.Tensor: all the audio waveform including the original waveform and augmented waveform + """ + # stage 0: store the original waveforms + waveforms_aug_list = [waveforms] + + # augment the original batch waveform + for aug in augment_pipeline: + # stage 1: augment the data + waveforms_aug = aug(waveforms) # (N, L) + if waveforms_aug.shape[1] >= waveforms.shape[1]: + # Trunc + waveforms_aug = waveforms_aug[:, :waveforms.shape[1]] + else: + # Pad + lengths_to_pad = waveforms.shape[1] - waveforms_aug.shape[1] + waveforms_aug = F.pad( + waveforms_aug.unsqueeze(-1), [0, lengths_to_pad], + data_format='NLC').squeeze(-1) + # stage 2: append the augmented waveform into the list + waveforms_aug_list.append(waveforms_aug) + + # get the all the waveforms + return paddle.concat(waveforms_aug_list, axis=0) diff --git a/paddlespeech/vector/io/batch.py b/paddlespeech/vector/io/batch.py new file mode 100644 index 000000000..92ca990cf --- /dev/null +++ b/paddlespeech/vector/io/batch.py @@ -0,0 +1,166 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy +import numpy as np +import paddle + + +def waveform_collate_fn(batch): + waveforms = np.stack([item['feat'] for item in batch]) + labels = np.stack([item['label'] for item in batch]) + + return {'waveforms': waveforms, 'labels': labels} + + +def feature_normalize(feats: paddle.Tensor, + mean_norm: bool=True, + std_norm: bool=True, + convert_to_numpy: bool=False): + # Features normalization if needed + # numpy.mean is a little with paddle.mean about 1e-6 + if convert_to_numpy: + feats_np = feats.numpy() + mean = feats_np.mean(axis=-1, keepdims=True) if mean_norm else 0 + std = feats_np.std(axis=-1, keepdims=True) if std_norm else 1 + feats_np = (feats_np - mean) / std + feats = paddle.to_tensor(feats_np, dtype=feats.dtype) + else: + mean = feats.mean(axis=-1, keepdim=True) if mean_norm else 0 + std = feats.std(axis=-1, keepdim=True) if std_norm else 1 + feats = (feats - mean) / std + + return feats + + +def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs): + x = np.asarray(x) + assert len( + x.shape) == 2, f'Only 2D arrays supported, but got shape: {x.shape}' + + w = target_length - x.shape[axis] + assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[axis]}' + + if axis == 0: + pad_width = [[0, w], [0, 0]] + else: + pad_width = [[0, 0], [0, w]] + + return np.pad(x, pad_width, mode=mode, **kwargs) + + +def batch_feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True): + ids = [item['id'] for item in batch] + lengths = np.asarray([item['feat'].shape[1] for item in batch]) + feats = list( + map(lambda x: pad_right_2d(x, lengths.max()), + [item['feat'] for item in batch])) + feats = np.stack(feats) + + # Features normalization if needed + for i in range(len(feats)): + feat = feats[i][:, :lengths[i]] # Excluding pad values. 
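+        # Compute mean/std over the valid (un-padded) frames only, then write the normalized frames back in place.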
+ mean = feat.mean(axis=-1, keepdims=True) if mean_norm else 0 + std = feat.std(axis=-1, keepdims=True) if std_norm else 1 + feats[i][:, :lengths[i]] = (feat - mean) / std + assert feats[i][:, lengths[ + i]:].sum() == 0 # Padding valus should all be 0. + + # Converts into ratios. + # the utterance of the max length doesn't need to padding + # the remaining utterances need to padding and all of them will be padded to max length + # we convert the original length of each utterance to the ratio of the max length + lengths = (lengths / lengths.max()).astype(np.float32) + + return {'ids': ids, 'feats': feats, 'lengths': lengths} + + +def pad_right_to(array, target_shape, mode="constant", value=0): + """ + This function takes a numpy array of arbitrary shape and pads it to target + shape by appending values on the right. + + Args: + array: input numpy array. Input array whose dimension we need to pad. + target_shape : (list, tuple). Target shape we want for the target array its len must be equal to array.ndim + mode : str. Pad mode, please refer to numpy.pad documentation. + value : float. Pad value, please refer to numpy.pad documentation. + + Returns: + array: numpy.array. Padded array. + valid_vals : list. List containing proportion for each dimension of original, non-padded values. + """ + assert len(target_shape) == array.ndim + pads = [] # this contains the abs length of the padding for each dimension. + valid_vals = [] # this contains the relative lengths for each dimension. + i = 0 # iterating over target_shape ndims + while i < len(target_shape): + assert (target_shape[i] >= array.shape[i] + ), "Target shape must be >= original shape for every dim" + pads.append([0, target_shape[i] - array.shape[i]]) + valid_vals.append(array.shape[i] / target_shape[i]) + i += 1 + + array = numpy.pad(array, pads, mode=mode, constant_values=value) + + return array, valid_vals + + +def batch_pad_right(arrays, mode="constant", value=0): + """Given a list of numpy arrays it batches them together by padding to the right + on each dimension in order to get same length for all. + + Args: + arrays : list. List of array we wish to pad together. + mode : str. Padding mode see numpy.pad documentation. + value : float. Padding value see numpy.pad documentation. + + Returns: + array : numpy.array. Padded array. + valid_vals : list. List containing proportion for each dimension of original, non-padded values. + """ + + if not len(arrays): + raise IndexError("arrays list must not be empty") + + if len(arrays) == 1: + # if there is only one array in the batch we simply unsqueeze it. + return numpy.expand_dims(arrays[0], axis=0), numpy.array([1.0]) + + if not (any( + [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))])): + raise IndexError("All arrays must have same number of dimensions") + + # FIXME we limit the support here: we allow padding of only the last dimension + # need to remove this when feat extraction is updated to handle multichannel. 
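+    # All leading dimensions must already match across the batch; only the last (time) dimension is padded up to the batch max.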
+ max_shape = [] + for dim in range(arrays[0].ndim): + if dim != (arrays[0].ndim - 1): + if not all( + [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]): + raise EnvironmentError( + "arrays should have same dimensions except for last one") + max_shape.append(max([x.shape[dim] for x in arrays])) + + batched = [] + valid = [] + for t in arrays: + # for each array we apply pad_right_to + padded, valid_percent = pad_right_to( + t, max_shape, mode=mode, value=value) + batched.append(padded) + valid.append(valid_percent[-1]) + + batched = numpy.stack(batched) + + return batched, numpy.array(valid) diff --git a/paddlespeech/vector/io/signal_processing.py b/paddlespeech/vector/io/signal_processing.py new file mode 100644 index 000000000..ee939bdb1 --- /dev/null +++ b/paddlespeech/vector/io/signal_processing.py @@ -0,0 +1,217 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np +import paddle + +# TODO: Complete type-hint and doc string. + + +def blackman_window(win_len, dtype=np.float32): + arcs = np.pi * np.arange(win_len) / float(win_len) + win = np.asarray( + [0.42 - 0.5 * np.cos(2 * arc) + 0.08 * np.cos(4 * arc) for arc in arcs], + dtype=dtype) + return paddle.to_tensor(win) + + +def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"): + if len(waveforms.shape) == 1: + waveforms = waveforms.unsqueeze(0) + + assert amp_type in ["avg", "peak"] + assert scale in ["linear", "dB"] + + if amp_type == "avg": + if lengths is None: + out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True) + else: + wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True) + out = wav_sum / lengths + elif amp_type == "peak": + out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True) + else: + raise NotImplementedError + + if scale == "linear": + return out + elif scale == "dB": + return paddle.clip(20 * paddle.log10(out), min=-80) + else: + raise NotImplementedError + + +def dB_to_amplitude(SNR): + return 10**(SNR / 20) + + +def convolve1d( + waveform, + kernel, + padding=0, + pad_type="constant", + stride=1, + groups=1, ): + if len(waveform.shape) != 3: + raise ValueError("Convolve1D expects a 3-dimensional tensor") + + # Padding can be a tuple (left_pad, right_pad) or an int + if isinstance(padding, list): + waveform = paddle.nn.functional.pad( + x=waveform, + pad=padding, + mode=pad_type, + data_format='NLC', ) + + # Move time dimension last, which pad and fft and conv expect. + # (N, L, C) -> (N, C, L) + waveform = waveform.transpose([0, 2, 1]) + kernel = kernel.transpose([0, 2, 1]) + + convolved = paddle.nn.functional.conv1d( + x=waveform, + weight=kernel, + stride=stride, + groups=groups, + padding=padding if not isinstance(padding, list) else 0, ) + + # Return time dimension to the second dimension. 
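+    # (N, C, L) -> (N, L, C)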
+ return convolved.transpose([0, 2, 1]) + + +def notch_filter(notch_freq, filter_width=101, notch_width=0.05): + # Check inputs + assert 0 < notch_freq <= 1 + assert filter_width % 2 != 0 + pad = filter_width // 2 + inputs = paddle.arange(filter_width, dtype='float32') - pad + + # Avoid frequencies that are too low + notch_freq += notch_width + + # Define sinc function, avoiding division by zero + def sinc(x): + def _sinc(x): + return paddle.sin(x) / x + + # The zero is at the middle index + res = paddle.concat( + [_sinc(x[:pad]), paddle.ones([1]), _sinc(x[pad + 1:])]) + return res + + # Compute a low-pass filter with cutoff frequency notch_freq. + hlpf = sinc(3 * (notch_freq - notch_width) * inputs) + # import torch + # hlpf *= paddle.to_tensor(torch.blackman_window(filter_width).detach().numpy()) + hlpf *= blackman_window(filter_width) + hlpf /= paddle.sum(hlpf) + + # Compute a high-pass filter with cutoff frequency notch_freq. + hhpf = sinc(3 * (notch_freq + notch_width) * inputs) + # hhpf *= paddle.to_tensor(torch.blackman_window(filter_width).detach().numpy()) + hhpf *= blackman_window(filter_width) + hhpf /= -paddle.sum(hhpf) + hhpf[pad] += 1 + + # Adding filters creates notch filter + return (hlpf + hhpf).reshape([1, -1, 1]) + + +def reverberate(waveforms, + rir_waveform, + sample_rate, + impulse_duration=0.3, + rescale_amp="avg"): + orig_shape = waveforms.shape + + if len(waveforms.shape) > 3 or len(rir_waveform.shape) > 3: + raise NotImplementedError + + # if inputs are mono tensors we reshape to 1, samples + if len(waveforms.shape) == 1: + waveforms = waveforms.unsqueeze(0).unsqueeze(-1) + elif len(waveforms.shape) == 2: + waveforms = waveforms.unsqueeze(-1) + + if len(rir_waveform.shape) == 1: # convolve1d expects a 3d tensor ! + rir_waveform = rir_waveform.unsqueeze(0).unsqueeze(-1) + elif len(rir_waveform.shape) == 2: + rir_waveform = rir_waveform.unsqueeze(-1) + + # Compute the average amplitude of the clean + orig_amplitude = compute_amplitude(waveforms, waveforms.shape[1], + rescale_amp) + + # Compute index of the direct signal, so we can preserve alignment + impulse_index_start = rir_waveform.abs().argmax(axis=1).item() + impulse_index_end = min( + impulse_index_start + int(sample_rate * impulse_duration), + rir_waveform.shape[1]) + rir_waveform = rir_waveform[:, impulse_index_start:impulse_index_end, :] + rir_waveform = rir_waveform / paddle.norm(rir_waveform, p=2) + rir_waveform = paddle.flip(rir_waveform, [1]) + + waveforms = convolve1d( + waveform=waveforms, + kernel=rir_waveform, + padding=[rir_waveform.shape[1] - 1, 0], ) + + # Rescale to the peak amplitude of the clean waveform + waveforms = rescale(waveforms, waveforms.shape[1], orig_amplitude, + rescale_amp) + + if len(orig_shape) == 1: + waveforms = waveforms.squeeze(0).squeeze(-1) + if len(orig_shape) == 2: + waveforms = waveforms.squeeze(-1) + + return waveforms + + +def rescale(waveforms, lengths, target_lvl, amp_type="avg", scale="linear"): + assert amp_type in ["peak", "avg"] + assert scale in ["linear", "dB"] + + batch_added = False + if len(waveforms.shape) == 1: + batch_added = True + waveforms = waveforms.unsqueeze(0) + + waveforms = normalize(waveforms, lengths, amp_type) + + if scale == "linear": + out = target_lvl * waveforms + elif scale == "dB": + out = dB_to_amplitude(target_lvl) * waveforms + + else: + raise NotImplementedError("Invalid scale, choose between dB and linear") + + if batch_added: + out = out.squeeze(0) + + return out + + +def normalize(waveforms, lengths=None, amp_type="avg", 
eps=1e-14): + assert amp_type in ["avg", "peak"] + + batch_added = False + if len(waveforms.shape) == 1: + batch_added = True + waveforms = waveforms.unsqueeze(0) + + den = compute_amplitude(waveforms, lengths, amp_type) + eps + if batch_added: + waveforms = waveforms.squeeze(0) + return waveforms / den diff --git a/paddlespeech/vector/models/ecapa_tdnn.py b/paddlespeech/vector/models/ecapa_tdnn.py index e493b8004..0e7287cd3 100644 --- a/paddlespeech/vector/models/ecapa_tdnn.py +++ b/paddlespeech/vector/models/ecapa_tdnn.py @@ -47,6 +47,19 @@ class Conv1d(nn.Layer): groups=1, bias=True, padding_mode="reflect", ): + """_summary_ + + Args: + in_channels (int): intput channel or input data dimensions + out_channels (int): output channel or output data dimensions + kernel_size (int): kernel size of 1-d convolution + stride (int, optional): strid in 1-d convolution . Defaults to 1. + padding (str, optional): padding value. Defaults to "same". + dilation (int, optional): dilation in 1-d convolution. Defaults to 1. + groups (int, optional): groups in 1-d convolution. Defaults to 1. + bias (bool, optional): bias in 1-d convolution . Defaults to True. + padding_mode (str, optional): padding mode. Defaults to "reflect". + """ super().__init__() self.kernel_size = kernel_size @@ -134,6 +147,15 @@ class TDNNBlock(nn.Layer): kernel_size, dilation, activation=nn.ReLU, ): + """Implementation of TDNN network + + Args: + in_channels (int): input channels or input embedding dimensions + out_channels (int): output channels or output embedding dimensions + kernel_size (int): the kernel size of the TDNN network block + dilation (int): the dilation of the TDNN network block + activation (paddle class, optional): the activation layers. Defaults to nn.ReLU. + """ super().__init__() self.conv = Conv1d( in_channels=in_channels, @@ -149,6 +171,15 @@ class TDNNBlock(nn.Layer): class Res2NetBlock(nn.Layer): def __init__(self, in_channels, out_channels, scale=8, dilation=1): + """Implementation of Res2Net Block with dilation + The paper is refered as "Res2Net: A New Multi-scale Backbone Architecture", + whose url is https://arxiv.org/abs/1904.01169 + Args: + in_channels (int): input channels or input dimensions + out_channels (int): output channels or output dimensions + scale (int, optional): scale in res2net bolck. Defaults to 8. + dilation (int, optional): dilation of 1-d convolution in TDNN block. Defaults to 1. + """ super().__init__() assert in_channels % scale == 0 assert out_channels % scale == 0 @@ -179,6 +210,14 @@ class Res2NetBlock(nn.Layer): class SEBlock(nn.Layer): def __init__(self, in_channels, se_channels, out_channels): + """Implementation of SEBlock + The paper is refered as "Squeeze-and-Excitation Networks" + whose url is https://arxiv.org/abs/1709.01507 + Args: + in_channels (int): input channels or input data dimensions + se_channels (_type_): _description_ + out_channels (int): output channels or output data dimensions + """ super().__init__() self.conv1 = Conv1d( @@ -275,6 +314,18 @@ class SERes2NetBlock(nn.Layer): kernel_size=1, dilation=1, activation=nn.ReLU, ): + """Implementation of Squeeze-Extraction Res2Blocks in ECAPA-TDNN network model + The paper is refered "Squeeze-and-Excitation Networks" + whose url is: https://arxiv.org/pdf/1709.01507.pdf + Args: + in_channels (int): input channels or input data dimensions + out_channels (int): output channels or output data dimensions + res2net_scale (int, optional): scale in the res2net block. Defaults to 8. 
+ se_channels (int, optional): embedding dimensions of res2net block. Defaults to 128. + kernel_size (int, optional): kernel size of 1-d convolution in TDNN block. Defaults to 1. + dilation (int, optional): dilation of 1-d convolution in TDNN block. Defaults to 1. + activation (paddle.nn.class, optional): activation function. Defaults to nn.ReLU. + """ super().__init__() self.out_channels = out_channels self.tdnn1 = TDNNBlock( @@ -326,7 +377,21 @@ class EcapaTdnn(nn.Layer): res2net_scale=8, se_channels=128, global_context=True, ): - + """Implementation of ECAPA-TDNN backbone model network + The paper is refered as "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification" + whose url is: https://arxiv.org/abs/2005.07143 + Args: + input_size (_type_): input fature dimension + lin_neurons (int, optional): speaker embedding size. Defaults to 192. + activation (paddle.nn.class, optional): activation function. Defaults to nn.ReLU. + channels (list, optional): inter embedding dimension. Defaults to [512, 512, 512, 512, 1536]. + kernel_sizes (list, optional): kernel size of 1-d convolution in TDNN block . Defaults to [5, 3, 3, 3, 1]. + dilations (list, optional): dilations of 1-d convolution in TDNN block. Defaults to [1, 2, 3, 4, 1]. + attention_channels (int, optional): attention dimensions. Defaults to 128. + res2net_scale (int, optional): scale value in res2net. Defaults to 8. + se_channels (int, optional): dimensions of squeeze-excitation block. Defaults to 128. + global_context (bool, optional): global context flag. Defaults to True. + """ super().__init__() assert len(channels) == len(kernel_sizes) assert len(channels) == len(dilations) diff --git a/paddlespeech/vector/modules/__init__.py b/paddlespeech/vector/modules/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddlespeech/vector/modules/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/vector/modules/loss.py b/paddlespeech/vector/modules/loss.py new file mode 100644 index 000000000..1c80dda4f --- /dev/null +++ b/paddlespeech/vector/modules/loss.py @@ -0,0 +1,93 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
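+# Angular-margin softmax losses for speaker verification training.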
+# This is modified from SpeechBrain +# https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/nnet/losses.py +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +class AngularMargin(nn.Layer): + def __init__(self, margin=0.0, scale=1.0): + """An implementation of Angular Margin (AM) proposed in the following + paper: '''Margin Matters: Towards More Discriminative Deep Neural Network + Embeddings for Speaker Recognition''' (https://arxiv.org/abs/1906.07317) + + Args: + margin (float, optional): The margin for cosine similiarity. Defaults to 0.0. + scale (float, optional): The scale for cosine similiarity. Defaults to 1.0. + """ + super(AngularMargin, self).__init__() + self.margin = margin + self.scale = scale + + def forward(self, outputs, targets): + outputs = outputs - self.margin * targets + return self.scale * outputs + + +class AdditiveAngularMargin(AngularMargin): + def __init__(self, margin=0.0, scale=1.0, easy_margin=False): + """The Implementation of Additive Angular Margin (AAM) proposed + in the following paper: '''Margin Matters: Towards More Discriminative Deep Neural Network Embeddings for Speaker Recognition''' + (https://arxiv.org/abs/1906.07317) + + Args: + margin (float, optional): margin factor. Defaults to 0.0. + scale (float, optional): scale factor. Defaults to 1.0. + easy_margin (bool, optional): easy_margin flag. Defaults to False. + """ + super(AdditiveAngularMargin, self).__init__(margin, scale) + self.easy_margin = easy_margin + + self.cos_m = math.cos(self.margin) + self.sin_m = math.sin(self.margin) + self.th = math.cos(math.pi - self.margin) + self.mm = math.sin(math.pi - self.margin) * self.margin + + def forward(self, outputs, targets): + cosine = outputs.astype('float32') + sine = paddle.sqrt(1.0 - paddle.pow(cosine, 2)) + phi = cosine * self.cos_m - sine * self.sin_m # cos(theta + m) + if self.easy_margin: + phi = paddle.where(cosine > 0, phi, cosine) + else: + phi = paddle.where(cosine > self.th, phi, cosine - self.mm) + outputs = (targets * phi) + ((1.0 - targets) * cosine) + return self.scale * outputs + + +class LogSoftmaxWrapper(nn.Layer): + def __init__(self, loss_fn): + """Speaker identificatin loss function wrapper + including all of compositions of the loss transformation + Args: + loss_fn (_type_): the loss value of a batch + """ + super(LogSoftmaxWrapper, self).__init__() + self.loss_fn = loss_fn + self.criterion = paddle.nn.KLDivLoss(reduction="sum") + + def forward(self, outputs, targets, length=None): + targets = F.one_hot(targets, outputs.shape[1]) + try: + predictions = self.loss_fn(outputs, targets) + except TypeError: + predictions = self.loss_fn(outputs) + + predictions = F.log_softmax(predictions, axis=1) + loss = self.criterion(predictions, targets) / targets.sum() + return loss diff --git a/paddlespeech/vector/modules/sid_model.py b/paddlespeech/vector/modules/sid_model.py new file mode 100644 index 000000000..4045f75d1 --- /dev/null +++ b/paddlespeech/vector/modules/sid_model.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+class SpeakerIdetification(nn.Layer):
+    def __init__(
+            self,
+            backbone,
+            num_class,
+            lin_blocks=0,
+            lin_neurons=192,
+            dropout=0.1, ):
+        """The speaker identification model, which consists of the speaker backbone network
+        and a linear transform to the speaker class num used in training
+
+        Args:
+            backbone (paddle.nn.Layer): the speaker identification backbone network model
+            num_class (int): the number of speaker classes in the training dataset
+            lin_blocks (int, optional): the number of linear layers between the embedding and the final layer. Defaults to 0.
+            lin_neurons (int, optional): the output dimension of the final linear layer. Defaults to 192.
+            dropout (float, optional): the dropout factor on the embedding. Defaults to 0.1.
+        """
+        super(SpeakerIdetification, self).__init__()
+        # speaker identification backbone network model
+        # the output of the backbone network is the speaker embedding
+        self.backbone = backbone
+        if dropout > 0:
+            self.dropout = nn.Dropout(dropout)
+        else:
+            self.dropout = None
+
+        # construct the speaker classifier
+        input_size = self.backbone.emb_size
+        self.blocks = nn.LayerList()
+        for i in range(lin_blocks):
+            self.blocks.extend([
+                nn.BatchNorm1D(input_size),
+                nn.Linear(in_features=input_size, out_features=lin_neurons),
+            ])
+            input_size = lin_neurons
+
+        # the final layer
+        self.weight = paddle.create_parameter(
+            shape=(input_size, num_class),
+            dtype='float32',
+            attr=paddle.ParamAttr(initializer=nn.initializer.XavierUniform()), )
+
+    def forward(self, x, lengths=None):
+        """Run the speaker identification model forward,
+           including the speaker embedding backbone and the classifier network
+
+        Args:
+            x (paddle.Tensor): input audio feats,
+                               shape=[batch, dimension, times]
+            lengths (paddle.Tensor, optional): input audio length.
+                                        shape=[batch, times]
+                                        Defaults to None.
+
+        Returns:
+            paddle.Tensor: the classification logits of the input feats
+        """
+        # x.shape: (N, C, L)
+        x = self.backbone(x, lengths).squeeze(
+            -1)  # (N, emb_size, 1) -> (N, emb_size)
+        if self.dropout is not None:
+            x = self.dropout(x)
+
+        for fc in self.blocks:
+            x = fc(x)
+
+        logits = F.linear(F.normalize(x), F.normalize(self.weight, axis=0))
+
+        return logits
diff --git a/paddlespeech/vector/training/__init__.py b/paddlespeech/vector/training/__init__.py
new file mode 100644
index 000000000..97043fd7b
--- /dev/null
+++ b/paddlespeech/vector/training/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
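For reviewers skimming the patch, here is a minimal usage sketch (not part of the diff) of how the pieces above fit together: the ECAPA-TDNN backbone feeds `SpeakerIdetification`, and its logits are trained with `LogSoftmaxWrapper` wrapped around `AdditiveAngularMargin`. The feature dimension, speaker count, margin and scale below are illustrative assumptions, not values taken from any recipe.

```python
import paddle

from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
from paddlespeech.vector.modules.loss import AdditiveAngularMargin, LogSoftmaxWrapper
from paddlespeech.vector.modules.sid_model import SpeakerIdetification

# Hypothetical setup: 80-dim fbank features and 1211 training speakers.
backbone = EcapaTdnn(input_size=80)
model = SpeakerIdetification(backbone=backbone, num_class=1211)
criterion = LogSoftmaxWrapper(
    loss_fn=AdditiveAngularMargin(margin=0.2, scale=30.0))

feats = paddle.randn([4, 80, 200])           # (batch, feat_dim, frames)
labels = paddle.randint(0, 1211, shape=[4])  # integer speaker ids

logits = model(feats)                        # (batch, num_class) cosine logits
loss = criterion(logits, labels)
loss.backward()
```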
diff --git a/paddlespeech/vector/training/scheduler.py b/paddlespeech/vector/training/scheduler.py new file mode 100644 index 000000000..3dcac0576 --- /dev/null +++ b/paddlespeech/vector/training/scheduler.py @@ -0,0 +1,45 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from paddle.optimizer.lr import LRScheduler + + +class CyclicLRScheduler(LRScheduler): + def __init__(self, + base_lr: float=1e-8, + max_lr: float=1e-3, + step_size: int=10000): + + super(CyclicLRScheduler, self).__init__() + + self.current_step = -1 + self.base_lr = base_lr + self.max_lr = max_lr + self.step_size = step_size + + def step(self): + if not hasattr(self, 'current_step'): + return + + self.current_step += 1 + if self.current_step >= 2 * self.step_size: + self.current_step %= 2 * self.step_size + + self.last_lr = self.get_lr() + + def get_lr(self): + p = self.current_step / (2 * self.step_size) # Proportion in one cycle. + if p < 0.5: # Increase + return self.base_lr + p / 0.5 * (self.max_lr - self.base_lr) + else: # Decrease + return self.max_lr - (p / 0.5 - 1) * (self.max_lr - self.base_lr) diff --git a/paddlespeech/vector/training/seeding.py b/paddlespeech/vector/training/seeding.py new file mode 100644 index 000000000..0778a27d6 --- /dev/null +++ b/paddlespeech/vector/training/seeding.py @@ -0,0 +1,28 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from paddlespeech.s2t.utils.log import Log + +logger = Log(__name__).getlog() +import random + +import numpy as np +import paddle + + +def seed_everything(seed: int): + """Seed paddle, random and np.random to help reproductivity.""" + paddle.seed(seed) + random.seed(seed) + np.random.seed(seed) + logger.info(f"Set the seed of paddle, random, np.random to {seed}.") diff --git a/paddlespeech/vector/utils/__init__.py b/paddlespeech/vector/utils/__init__.py new file mode 100644 index 000000000..97043fd7b --- /dev/null +++ b/paddlespeech/vector/utils/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlespeech/vector/utils/time.py b/paddlespeech/vector/utils/time.py new file mode 100644 index 000000000..8e85b0e12 --- /dev/null +++ b/paddlespeech/vector/utils/time.py @@ -0,0 +1,66 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +import time + + +class Timer(object): + '''Calculate runing speed and estimated time of arrival(ETA)''' + + def __init__(self, total_step: int): + self.total_step = total_step + self.last_start_step = 0 + self.current_step = 0 + self._is_running = True + + def start(self): + self.last_time = time.time() + self.start_time = time.time() + + def stop(self): + self._is_running = False + self.end_time = time.time() + + def count(self) -> int: + if not self.current_step >= self.total_step: + self.current_step += 1 + return self.current_step + + @property + def timing(self) -> float: + run_steps = self.current_step - self.last_start_step + self.last_start_step = self.current_step + time_used = time.time() - self.last_time + self.last_time = time.time() + return time_used / run_steps + + @property + def is_running(self) -> bool: + return self._is_running + + @property + def eta(self) -> str: + if not self.is_running: + return '00:00:00' + remaining_time = time.time() - self.start_time + return seconds_to_hms(remaining_time) + + +def seconds_to_hms(seconds: int) -> str: + '''Convert the number of seconds to hh:mm:ss''' + h = math.floor(seconds / 3600) + m = math.floor((seconds - h * 3600) / 60) + s = int(seconds - h * 3600 - m * 60) + hms_str = '{:0>2}:{:0>2}:{:0>2}'.format(h, m, s) + return hms_str diff --git a/setup.py b/setup.py index f86758bab..82ff63412 100644 --- a/setup.py +++ b/setup.py @@ -27,7 +27,7 @@ from setuptools.command.install import install HERE = Path(os.path.abspath(os.path.dirname(__file__))) -VERSION = '0.1.2' +VERSION = '0.2.0' base = [ "editdistance", diff --git a/speechx/.gitignore b/speechx/.gitignore new file mode 100644 index 000000000..e0c618470 --- /dev/null +++ b/speechx/.gitignore @@ -0,0 +1 @@ +tools/valgrind* diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt index e003136a9..f1330d1da 100644 --- a/speechx/CMakeLists.txt +++ b/speechx/CMakeLists.txt @@ -2,18 +2,32 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR) project(paddlespeech VERSION 0.1) +set(CMAKE_PROJECT_INCLUDE_BEFORE "${CMAKE_CURRENT_SOURCE_DIR}/cmake/EnableCMP0048.cmake") + set(CMAKE_VERBOSE_MAKEFILE on) + # set std-14 set(CMAKE_CXX_STANDARD 14) -# include file +# cmake dir +set(speechx_cmake_dir ${PROJECT_SOURCE_DIR}/cmake) + +# Modules +list(APPEND 
CMAKE_MODULE_PATH ${speechx_cmake_dir}/external) +list(APPEND CMAKE_MODULE_PATH ${speechx_cmake_dir}) include(FetchContent) include(ExternalProject) + # fc_patch dir set(FETCHCONTENT_QUIET off) get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}") set(FETCHCONTENT_BASE_DIR ${fc_patch}) +# compiler option +# Keep the same with openfst, -fPIC or -fpic +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++14 -pthread -fPIC -O0 -Wall -g") +SET(CMAKE_CXX_FLAGS_DEBUG "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O0 -Wall -g -ggdb") +SET(CMAKE_CXX_FLAGS_RELEASE "$ENV{CXXFLAGS} --std=c++14 -pthread -fPIC -O3 -Wall") ############################################################################### # Option Configurations @@ -25,91 +39,92 @@ option(TEST_DEBUG "option for debug" OFF) ############################################################################### # Include third party ############################################################################### -# #example for include third party -# FetchContent_Declare() -# # FetchContent_MakeAvailable was not added until CMake 3.14 +# example for include third party +# FetchContent_MakeAvailable was not added until CMake 3.14 # FetchContent_MakeAvailable() # include_directories() +# gflags +include(gflags) + +# glog +include(glog) + +# gtest +include(gtest) + # ABSEIL-CPP -include(FetchContent) -FetchContent_Declare( - absl - GIT_REPOSITORY "https://github.com/abseil/abseil-cpp.git" - GIT_TAG "20210324.1" -) -FetchContent_MakeAvailable(absl) +include(absl) # libsndfile -include(FetchContent) -FetchContent_Declare( - libsndfile - GIT_REPOSITORY "https://github.com/libsndfile/libsndfile.git" - GIT_TAG "1.0.31" -) -FetchContent_MakeAvailable(libsndfile) +include(libsndfile) -# gflags -FetchContent_Declare( - gflags - URL https://github.com/gflags/gflags/archive/v2.2.1.zip - URL_HASH SHA256=4e44b69e709c826734dbbbd5208f61888a2faf63f239d73d8ba0011b2dccc97a -) -FetchContent_MakeAvailable(gflags) -include_directories(${gflags_BINARY_DIR}/include) +# boost +# include(boost) # not work +set(boost_SOURCE_DIR ${fc_patch}/boost-src) +set(BOOST_ROOT ${boost_SOURCE_DIR}) +# #find_package(boost REQUIRED PATHS ${BOOST_ROOT}) -# glog -FetchContent_Declare( - glog - URL https://github.com/google/glog/archive/v0.4.0.zip - URL_HASH SHA256=9e1b54eb2782f53cd8af107ecf08d2ab64b8d0dc2b7f5594472f3bd63ca85cdc -) -FetchContent_MakeAvailable(glog) -include_directories(${glog_BINARY_DIR}) +# Eigen +include(eigen) +find_package(Eigen3 REQUIRED) -# gtest -FetchContent_Declare(googletest - URL https://github.com/google/googletest/archive/release-1.10.0.zip - URL_HASH SHA256=94c634d499558a76fa649edb13721dce6e98fb1e7018dfaeba3cd7a083945e91 -) -FetchContent_MakeAvailable(googletest) +# Kenlm +include(kenlm) +add_dependencies(kenlm eigen boost) + +#openblas +include(openblas) # openfst -set(openfst_SOURCE_DIR ${fc_patch}/openfst-src) -set(openfst_BINARY_DIR ${fc_patch}/openfst-build) -set(openfst_PREFIX_DIR ${fc_patch}/openfst-subbuild/openfst-populate-prefix) -ExternalProject_Add(openfst - URL https://github.com/mjansche/openfst/archive/refs/tags/1.7.2.zip - URL_HASH SHA256=ffc56931025579a8af3515741c0f3b0fc3a854c023421472c07ca0c6389c75e6 - SOURCE_DIR ${openfst_SOURCE_DIR} - BINARY_DIR ${openfst_BINARY_DIR} - CONFIGURE_COMMAND ${openfst_SOURCE_DIR}/configure --prefix=${openfst_PREFIX_DIR} - "CPPFLAGS=-I${gflags_BINARY_DIR}/include -I${glog_SOURCE_DIR}/src -I${glog_BINARY_DIR}" - "LDFLAGS=-L${gflags_BINARY_DIR} -L${glog_BINARY_DIR}" - "LIBS=-lgflags_nothreads 
-lglog -lpthread" - BUILD_COMMAND make -j 4 -) +include(openfst) add_dependencies(openfst gflags glog) -link_directories(${openfst_PREFIX_DIR}/lib) -include_directories(${openfst_PREFIX_DIR}/include) -add_subdirectory(speechx) -#openblas -#set(OpenBLAS_INSTALL_PREFIX ${fc_patch}/OpenBLAS) -#set(OpenBLAS_SOURCE_DIR ${fc_patch}/OpenBLAS-src) -#ExternalProject_Add( -# OpenBLAS -# GIT_REPOSITORY https://github.com/xianyi/OpenBLAS -# GIT_TAG v0.3.13 -# GIT_SHALLOW TRUE -# GIT_PROGRESS TRUE -# CONFIGURE_COMMAND "" -# BUILD_IN_SOURCE TRUE -# BUILD_COMMAND make USE_LOCKING=1 USE_THREAD=0 -# INSTALL_COMMAND make PREFIX=${OpenBLAS_INSTALL_PREFIX} install -# UPDATE_DISCONNECTED TRUE -#) +# paddle lib +set(paddle_SOURCE_DIR ${fc_patch}/paddle-lib) +set(paddle_PREFIX_DIR ${fc_patch}/paddle-lib-prefix) +ExternalProject_Add(paddle + URL https://paddle-inference-lib.bj.bcebos.com/2.2.2/cxx_c/Linux/CPU/gcc8.2_avx_mkl/paddle_inference.tgz + URL_HASH SHA256=7c6399e778c6554a929b5a39ba2175e702e115145e8fa690d2af974101d98873 + PREFIX ${paddle_PREFIX_DIR} + SOURCE_DIR ${paddle_SOURCE_DIR} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" +) + +set(PADDLE_LIB ${fc_patch}/paddle-lib) +include_directories("${PADDLE_LIB}/paddle/include") +set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") + +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") +link_directories("${PADDLE_LIB}/paddle/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}mklml/lib") + +##paddle with mkl +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") +set(MATH_LIB_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mklml") +include_directories("${MATH_LIB_PATH}/include") +set(MATH_LIB ${MATH_LIB_PATH}/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} + ${MATH_LIB_PATH}/lib/libiomp5${CMAKE_SHARED_LIBRARY_SUFFIX}) +set(MKLDNN_PATH "${PADDLE_LIB_THIRD_PARTY_PATH}mkldnn") +include_directories("${MKLDNN_PATH}/include") +set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0) +set(EXTERNAL_LIB "-lrt -ldl -lpthread") + +set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_inference${CMAKE_SHARED_LIBRARY_SUFFIX}) +set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + glog gflags protobuf xxhash cryptopp + ${EXTERNAL_LIB}) + + ############################################################################### # Add local library @@ -121,4 +136,9 @@ add_subdirectory(speechx) # if dir do not have CmakeLists.txt #add_library(lib_name STATIC file.cc) #target_link_libraries(lib_name item0 item1) -#add_dependencies(lib_name depend-target) \ No newline at end of file +#add_dependencies(lib_name depend-target) + +set(SPEECHX_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/speechx) + +add_subdirectory(speechx) +add_subdirectory(examples) \ No newline at end of file diff --git a/speechx/README.md b/speechx/README.md new file mode 100644 index 000000000..610b88a8f --- /dev/null +++ b/speechx/README.md @@ -0,0 +1,63 @@ +# SpeechX -- All in One Speech Task Inference + +## Environment + +We develop under: +* docker - registry.baidubce.com/paddlepaddle/paddle:2.1.1-gpu-cuda10.2-cudnn7 +* os - Ubuntu 16.04.7 LTS +* gcc/g++/gfortran - 8.2.0 +* cmake - 3.16.0 + +> We make sure all things work fun under docker, and recommend using it to develop and deploy. 
+
+* [How to Install Docker](https://docs.docker.com/engine/install/)
+* [A Docker Tutorial for Beginners](https://docker-curriculum.com/)
+* [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/overview.html)
+
+## Build
+
+1. First, launch the docker container.
+
+```
+nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspace --name=dev registry.baidubce.com/paddlepaddle/paddle:2.1.1-gpu-cuda10.2-cudnn7 /bin/bash
+```
+
+* More `Paddle` docker images can be found [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html).
+
+* If you only want to work on CPU, please download the corresponding [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and use `docker` instead of `nvidia-docker`.
+
+
+2. Build `speechx` and `examples`.
+
+> Do not source venv.
+
+```
+pushd /path/to/speechx
+./build.sh
+```
+
+3. Go to `examples` and have fun.
+
+For more details, please see the `README.md` under `examples`.
+
+
+## Valgrind (Optional)
+
+> If using docker, please check that `--privileged` is set when you `docker run`.
+
+* Fatal error at startup: `a function redirection which is mandatory for this platform-tool combination cannot be set up`
+```
+apt-get install libc6-dbg
+```
+
+* Install
+
+```
+pushd tools
+./setup_valgrind.sh
+popd
+```
+
+## TODO
+
+* DecibelNormalizer: there is a small difference between offline and online db norm. The online db norm reads the feature chunk by chunk, so the feature size differs from that of the offline db norm. In normalizer.cc:73, samples.size() is different, which causes the difference in results.
diff --git a/speechx/build.sh b/speechx/build.sh
new file mode 100755
index 000000000..8e36d2336
--- /dev/null
+++ b/speechx/build.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+# This build script has been verified in the PaddlePaddle docker image.
+# Please follow the instructions below to install the PaddlePaddle image.
+# https://www.paddlepaddle.org.cn/documentation/docs/zh/install/docker/linux-docker.html
+boost_SOURCE_DIR=$PWD/fc_patch/boost-src
+if [ ! -d ${boost_SOURCE_DIR} ]; then wget -c https://boostorg.jfrog.io/artifactory/main/release/1.75.0/source/boost_1_75_0.tar.gz
+    tar xzfv boost_1_75_0.tar.gz
+    mkdir -p $PWD/fc_patch
+    mv boost_1_75_0 ${boost_SOURCE_DIR}
+    cd ${boost_SOURCE_DIR}
+    bash ./bootstrap.sh
+    ./b2
+    cd -
+    echo -e "\n"
+fi
+
+#rm -rf build
+mkdir -p build
+cd build
+
+cmake .. -DBOOST_ROOT:STRING=${boost_SOURCE_DIR}
+#cmake ..
+ +make -j10 + +cd - diff --git a/speechx/cmake/EnableCMP0048.cmake b/speechx/cmake/EnableCMP0048.cmake new file mode 100644 index 000000000..1b59188fd --- /dev/null +++ b/speechx/cmake/EnableCMP0048.cmake @@ -0,0 +1 @@ +cmake_policy(SET CMP0048 NEW) \ No newline at end of file diff --git a/speechx/cmake/FindGFortranLibs.cmake b/speechx/cmake/FindGFortranLibs.cmake new file mode 100644 index 000000000..763f78833 --- /dev/null +++ b/speechx/cmake/FindGFortranLibs.cmake @@ -0,0 +1,145 @@ +#.rst: +# FindGFortranLibs +# -------- +# https://github.com/Argonne-National-Laboratory/PIPS/blob/master/cmake/Modules/FindGFortranLibs.cmake +# https://enccs.github.io/cmake-workshop/cxx-fortran/ +# +# Find gcc Fortran compiler & library paths +# +# The module defines the following variables: +# +# :: +# +# +# GFORTRANLIBS_FOUND - true if system has gfortran +# LIBGFORTRAN_LIBRARIES - path to libgfortran +# LIBQUADMATH_LIBRARIES - path to libquadmath +# GFORTRAN_LIBARIES_DIR - directory containing libgfortran, libquadmath +# GFORTRAN_INCLUDE_DIR - directory containing gfortran/gcc headers +# LIBGOMP_LIBRARIES - path to libgomp +# LIBGOMP_INCLUDE_DIR - directory containing omp.h header +# GFORTRAN_VERSION_STRING - version of gfortran found +# +set(CMAKE_REQUIRED_QUIET ${LIBIOMP_FIND_QUIETLY}) + +if(NOT CMAKE_REQUIRED_QUIET) + message(STATUS "Looking for gfortran related libraries...") +endif() + +enable_language(Fortran) +if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU") + + # Basically, call "gfortran -v" to dump compiler info to the string + # GFORTRAN_VERBOSE_STR, which will be used to get necessary paths + message(STATUS "Extracting library and header information by calling 'gfortran -v'...") + execute_process(COMMAND "${CMAKE_Fortran_COMPILER}" "-v" ERROR_VARIABLE + GFORTRAN_VERBOSE_STR RESULT_VARIABLE FLAG) + + # For debugging + message(STATUS "'gfortran -v' returned:") + message(STATUS "${GFORTRAN_VERBOSE_STR}") + + # Detect gfortran version + string(REGEX MATCH "gcc version [^\t\n ]+" GFORTRAN_VER_STR "${GFORTRAN_VERBOSE_STR}") + string(REGEX REPLACE "gcc version ([^\t\n ]+)" "\\1" GFORTRAN_VERSION_STRING "${GFORTRAN_VER_STR}") + message(STATUS "Detected gfortran version ${GFORTRAN_VERSION_STRING}") + unset(GFORTRAN_VER_STR) + + set(MATCH_REGEX "[^\t\n ]+[\t\n ]+") + set(REPLACE_REGEX "([^\t\n ]+)") + + # Find architecture for compiler + string(REGEX MATCH "Target: [^\t\n ]+" + GFORTRAN_ARCH_STR "${GFORTRAN_VERBOSE_STR}") + message(STATUS "Architecture string: ${GFORTRAN_ARCH_STR}") + string(REGEX REPLACE "Target: ([^\t\n ]+)" "\\1" + GFORTRAN_ARCH "${GFORTRAN_ARCH_STR}") + message(STATUS "Detected gfortran architecture: ${GFORTRAN_ARCH}") + unset(GFORTRAN_ARCH_STR) + + # Find install prefix, if it exists; if not, use default + string(REGEX MATCH "--prefix=[^\t\n ]+[\t\n ]+" + GFORTRAN_PREFIX_STR "${GFORTRAN_VERBOSE_STR}") + if(NOT GFORTRAN_PREFIX_STR) + message(STATUS "Detected default gfortran prefix") + set(GFORTRAN_PREFIX_DIR "/usr/local") # default prefix for gcc install + else() + string(REGEX REPLACE "--prefix=([^\t\n ]+)" "\\1" + GFORTRAN_PREFIX_DIR "${GFORTRAN_PREFIX_STR}") + endif() + message(STATUS "Detected gfortran prefix: ${GFORTRAN_PREFIX_DIR}") + unset(GFORTRAN_PREFIX_STR) + + # Find install exec-prefix, if it exists; if not, use default + string(REGEX MATCH "--exec-prefix=[^\t\n ]+[\t\n ]+" "\\1" + GFORTRAN_EXEC_PREFIX_STR "${GFORTRAN_VERBOSE_STR}") + if(NOT GFORTRAN_EXEC_PREFIX_STR) + message(STATUS "Detected default gfortran exec-prefix") + set(GFORTRAN_EXEC_PREFIX_DIR 
"${GFORTRAN_PREFIX_DIR}") + else() + string(REGEX REPLACE "--exec-prefix=([^\t\n ]+)" "\\1" + GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_EXEC_PREFIX_STR}") + endif() + message(STATUS "Detected gfortran exec-prefix: ${GFORTRAN_EXEC_PREFIX_DIR}") + UNSET(GFORTRAN_EXEC_PREFIX_STR) + + # Find library directory and include directory, if library directory specified + string(REGEX MATCH "--libdir=[^\t\n ]+" + GFORTRAN_LIB_DIR_STR "${GFORTRAN_VERBOSE_STR}") + if(NOT GFORTRAN_LIB_DIR_STR) + message(STATUS "Found --libdir flag -- not found") + message(STATUS "Using default gfortran library & include directory paths") + set(GFORTRAN_LIBRARIES_DIR + "${GFORTRAN_EXEC_PREFIX_DIR}/lib/gcc/${GFORTRAN_ARCH}/${GFORTRAN_VERSION_STRING}") + string(CONCAT GFORTRAN_INCLUDE_DIR "${GFORTRAN_LIBRARIES_DIR}" "/include") + else() + message(STATUS "Found --libdir flag -- yes") + string(REGEX REPLACE "--libdir=([^\t\n ]+)" "\\1" + GFORTRAN_LIBRARIES_DIR "${GFORTRAN_LIB_DIR_STR}") + string(CONCAT GFORTRAN_INCLUDE_DIR "${GFORTRAN_LIBRARIES_DIR}" "/gcc/" "${GFORTRAN_ARCH}" "/" "${GFORTRAN_VERSION_STRING}" "/include") + endif() + message(STATUS "gfortran libraries path: ${GFORTRAN_LIBRARIES_DIR}") + message(STATUS "gfortran include path dir: ${GFORTRAN_INCLUDE_DIR}") + unset(GFORTRAN_LIB_DIR_STR) + + # There are lots of other build options for gcc & gfortran. For now, the + # options implemented above should cover a lot of common use cases. + + # Clean up be deleting the output string from "gfortran -v" + unset(GFORTRAN_VERBOSE_STR) + + # Find paths for libgfortran, libquadmath, libgomp + # libgomp needed for OpenMP support without Clang + find_library(LIBGFORTRAN_LIBRARIES NAMES gfortran libgfortran + HINTS ${GFORTRAN_LIBRARIES_DIR}) + find_library(LIBQUADMATH_LIBRARIES NAMES quadmath libquadmath + HINTS ${GFORTRAN_LIBRARIES_DIR}) + find_library(LIBGOMP_LIBRARIES NAMES gomp libgomp + HINTS ${GFORTRAN_LIBRARIES_DIR}) + + # Find OpenMP headers + find_path(LIBGOMP_INCLUDE_DIR NAMES omp.h HINTS ${GFORTRAN_INCLUDE_DIR}) + +else() + message(STATUS "CMAKE_Fortran_COMPILER_ID does not match 'GNU'!") +endif() + +include(FindPackageHandleStandardArgs) + +# Required: libgfortran, libquadmath, path for gfortran libraries +# Optional: libgomp, path for OpenMP headers, path for gcc/gfortran headers +find_package_handle_standard_args(GFortranLibs + REQUIRED_VARS LIBGFORTRAN_LIBRARIES LIBQUADMATH_LIBRARIES GFORTRAN_LIBRARIES_DIR + VERSION_VAR GFORTRAN_VERSION_STRING) + +if(GFORTRANLIBS_FOUND) + message(STATUS "Looking for gfortran libraries -- found") + message(STATUS "gfortran version: ${GFORTRAN_VERSION_STRING}") +else() + message(STATUS "Looking for gfortran libraries -- not found") +endif() + +mark_as_advanced(LIBGFORTRAN_LIBRARIES LIBQUADMATH_LIBRARIES + LIBGOMP_LIBRARIES LIBGOMP_INCLUDE_DIR + GFORTRAN_LIBRARIES_DIR GFORTRAN_INCLUDE_DIR) +# FindGFortranLIBS.cmake ends here \ No newline at end of file diff --git a/speechx/cmake/external/absl.cmake b/speechx/cmake/external/absl.cmake new file mode 100644 index 000000000..2c5e5af5c --- /dev/null +++ b/speechx/cmake/external/absl.cmake @@ -0,0 +1,16 @@ +include(FetchContent) + + +set(BUILD_SHARED_LIBS OFF) # up to you +set(BUILD_TESTING OFF) # to disable abseil test, or gtest will fail. +set(ABSL_ENABLE_INSTALL ON) # now you can enable install rules even in subproject... 
+ +FetchContent_Declare( + absl + GIT_REPOSITORY "https://github.com/abseil/abseil-cpp.git" + GIT_TAG "20210324.1" +) +FetchContent_MakeAvailable(absl) + +set(EIGEN3_INCLUDE_DIR ${Eigen3_SOURCE_DIR}) +include_directories(${absl_SOURCE_DIR}) \ No newline at end of file diff --git a/speechx/cmake/external/boost.cmake b/speechx/cmake/external/boost.cmake new file mode 100644 index 000000000..6bc97aad4 --- /dev/null +++ b/speechx/cmake/external/boost.cmake @@ -0,0 +1,27 @@ +include(FetchContent) +set(Boost_DEBUG ON) + +set(Boost_PREFIX_DIR ${fc_patch}/boost) +set(Boost_SOURCE_DIR ${fc_patch}/boost-src) + +FetchContent_Declare( + Boost + URL https://boostorg.jfrog.io/artifactory/main/release/1.75.0/source/boost_1_75_0.tar.gz + URL_HASH SHA256=aeb26f80e80945e82ee93e5939baebdca47b9dee80a07d3144be1e1a6a66dd6a + PREFIX ${Boost_PREFIX_DIR} + SOURCE_DIR ${Boost_SOURCE_DIR} +) + +execute_process(COMMAND bootstrap.sh WORKING_DIRECTORY ${Boost_SOURCE_DIR}) +execute_process(COMMAND b2 WORKING_DIRECTORY ${Boost_SOURCE_DIR}) + +FetchContent_MakeAvailable(Boost) + +message(STATUS "boost src dir: ${Boost_SOURCE_DIR}") +message(STATUS "boost inc dir: ${Boost_INCLUDE_DIR}") +message(STATUS "boost bin dir: ${Boost_BINARY_DIR}") + +set(BOOST_ROOT ${Boost_SOURCE_DIR}) +message(STATUS "boost root dir: ${BOOST_ROOT}") + +include_directories(${Boost_SOURCE_DIR}) \ No newline at end of file diff --git a/speechx/cmake/external/eigen.cmake b/speechx/cmake/external/eigen.cmake new file mode 100644 index 000000000..12bd3cdf5 --- /dev/null +++ b/speechx/cmake/external/eigen.cmake @@ -0,0 +1,27 @@ +include(FetchContent) + +# update eigen to the commit id f612df27 on 03/16/2021 +set(EIGEN_PREFIX_DIR ${fc_patch}/eigen3) + +FetchContent_Declare( + Eigen3 + GIT_REPOSITORY https://gitlab.com/libeigen/eigen.git + GIT_TAG master + PREFIX ${EIGEN_PREFIX_DIR} + GIT_SHALLOW TRUE + GIT_PROGRESS TRUE) + +set(EIGEN_BUILD_DOC OFF) +# note: To disable eigen tests, +# you should put this code in a add_subdirectory to avoid to change +# BUILD_TESTING for your own project too since variables are directory +# scoped +set(BUILD_TESTING OFF) +set(EIGEN_BUILD_PKGCONFIG OFF) +set( OFF) +FetchContent_MakeAvailable(Eigen3) + +message(STATUS "eigen src dir: ${Eigen3_SOURCE_DIR}") +message(STATUS "eigen bin dir: ${Eigen3_BINARY_DIR}") +#include_directories(${Eigen3_SOURCE_DIR}) +#link_directories(${Eigen3_BINARY_DIR}) \ No newline at end of file diff --git a/speechx/cmake/external/gflags.cmake b/speechx/cmake/external/gflags.cmake new file mode 100644 index 000000000..66ae47f70 --- /dev/null +++ b/speechx/cmake/external/gflags.cmake @@ -0,0 +1,12 @@ +include(FetchContent) + +FetchContent_Declare( + gflags + URL https://github.com/gflags/gflags/archive/v2.2.1.zip + URL_HASH SHA256=4e44b69e709c826734dbbbd5208f61888a2faf63f239d73d8ba0011b2dccc97a +) + +FetchContent_MakeAvailable(gflags) + +# openfst need +include_directories(${gflags_BINARY_DIR}/include) \ No newline at end of file diff --git a/speechx/cmake/external/glog.cmake b/speechx/cmake/external/glog.cmake new file mode 100644 index 000000000..dcfd86c3e --- /dev/null +++ b/speechx/cmake/external/glog.cmake @@ -0,0 +1,8 @@ +include(FetchContent) +FetchContent_Declare( + glog + URL https://github.com/google/glog/archive/v0.4.0.zip + URL_HASH SHA256=9e1b54eb2782f53cd8af107ecf08d2ab64b8d0dc2b7f5594472f3bd63ca85cdc +) +FetchContent_MakeAvailable(glog) +include_directories(${glog_BINARY_DIR} ${glog_SOURCE_DIR}/src) diff --git a/speechx/cmake/external/gtest.cmake b/speechx/cmake/external/gtest.cmake 
new file mode 100644 index 000000000..7fe397fcb --- /dev/null +++ b/speechx/cmake/external/gtest.cmake @@ -0,0 +1,9 @@ +include(FetchContent) +FetchContent_Declare( + gtest + URL https://github.com/google/googletest/archive/release-1.10.0.zip + URL_HASH SHA256=94c634d499558a76fa649edb13721dce6e98fb1e7018dfaeba3cd7a083945e91 +) +FetchContent_MakeAvailable(gtest) + +include_directories(${gtest_BINARY_DIR} ${gtest_SOURCE_DIR}/src) \ No newline at end of file diff --git a/speechx/cmake/external/kenlm.cmake b/speechx/cmake/external/kenlm.cmake new file mode 100644 index 000000000..17c76c3f6 --- /dev/null +++ b/speechx/cmake/external/kenlm.cmake @@ -0,0 +1,10 @@ +include(FetchContent) +FetchContent_Declare( + kenlm + GIT_REPOSITORY "https://github.com/kpu/kenlm.git" + GIT_TAG "df2d717e95183f79a90b2fa6e4307083a351ca6a" +) +# https://github.com/kpu/kenlm/blob/master/cmake/modules/FindEigen3.cmake +set(EIGEN3_INCLUDE_DIR ${Eigen3_SOURCE_DIR}) +FetchContent_MakeAvailable(kenlm) +include_directories(${kenlm_SOURCE_DIR}) \ No newline at end of file diff --git a/speechx/cmake/external/libsndfile.cmake b/speechx/cmake/external/libsndfile.cmake new file mode 100644 index 000000000..52d64bacd --- /dev/null +++ b/speechx/cmake/external/libsndfile.cmake @@ -0,0 +1,56 @@ +include(FetchContent) + +# https://github.com/pongasoft/vst-sam-spl-64/blob/master/libsndfile.cmake +# https://github.com/popojan/goban/blob/master/CMakeLists.txt#L38 +# https://github.com/ddiakopoulos/libnyquist/blob/master/CMakeLists.txt + +if(LIBSNDFILE_ROOT_DIR) + # instructs FetchContent to not download or update but use the location instead + set(FETCHCONTENT_SOURCE_DIR_LIBSNDFILE ${LIBSNDFILE_ROOT_DIR}) +else() + set(FETCHCONTENT_SOURCE_DIR_LIBSNDFILE "") +endif() + +set(LIBSNDFILE_GIT_REPO "https://github.com/libsndfile/libsndfile.git" CACHE STRING "libsndfile git repository url" FORCE) +set(LIBSNDFILE_GIT_TAG 1.0.31 CACHE STRING "libsndfile git tag" FORCE) + +FetchContent_Declare(libsndfile + GIT_REPOSITORY ${LIBSNDFILE_GIT_REPO} + GIT_TAG ${LIBSNDFILE_GIT_TAG} + GIT_CONFIG advice.detachedHead=false +# GIT_SHALLOW true + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) + +FetchContent_GetProperties(libsndfile) +if(NOT libsndfile_POPULATED) + if(FETCHCONTENT_SOURCE_DIR_LIBSNDFILE) + message(STATUS "Using libsndfile from local ${FETCHCONTENT_SOURCE_DIR_LIBSNDFILE}") + else() + message(STATUS "Fetching libsndfile ${LIBSNDFILE_GIT_REPO}/tree/${LIBSNDFILE_GIT_TAG}") + endif() + FetchContent_Populate(libsndfile) +endif() + +set(LIBSNDFILE_ROOT_DIR ${libsndfile_SOURCE_DIR}) +set(LIBSNDFILE_INCLUDE_DIR "${libsndfile_BINARY_DIR}/src") + +function(libsndfile_build) + option(BUILD_PROGRAMS "Build programs" OFF) + option(BUILD_EXAMPLES "Build examples" OFF) + option(BUILD_TESTING "Build examples" OFF) + option(ENABLE_CPACK "Enable CPack support" OFF) + option(ENABLE_PACKAGE_CONFIG "Generate and install package config file" OFF) + option(BUILD_REGTEST "Build regtest" OFF) + # finally we include libsndfile itself + add_subdirectory(${libsndfile_SOURCE_DIR} ${libsndfile_BINARY_DIR} EXCLUDE_FROM_ALL) + # copying .hh for c++ support + #file(COPY "${libsndfile_SOURCE_DIR}/src/sndfile.hh" DESTINATION ${LIBSNDFILE_INCLUDE_DIR}) +endfunction() + +libsndfile_build() + +include_directories(${LIBSNDFILE_INCLUDE_DIR}) \ No newline at end of file diff --git a/speechx/cmake/external/openblas.cmake b/speechx/cmake/external/openblas.cmake new file mode 100644 index 000000000..5c196527e --- /dev/null +++ 
b/speechx/cmake/external/openblas.cmake @@ -0,0 +1,58 @@ +include(FetchContent) + +set(OpenBLAS_SOURCE_DIR ${fc_patch}/OpenBLAS-src) +set(OpenBLAS_PREFIX ${fc_patch}/OpenBLAS-prefix) + +# ###################################################################################################################### +# OPENBLAS https://github.com/lattice/quda/blob/develop/CMakeLists.txt#L575 +# ###################################################################################################################### +enable_language(Fortran) + +include(FortranCInterface) + +# # Clang doesn't have a Fortran compiler in its suite (yet), +# # so detect libraries for gfortran; we need equivalents to +# # libgfortran and libquadmath, which are implicitly +# # linked by flags in CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES +# include(FindGFortranLibs REQUIRED) +# # Add directory containing libgfortran and libquadmath to +# # linker. Should also contain libgomp, if not using +# # Intel OpenMP runtime +# link_directories(${GFORTRAN_LIBRARIES_DIR}) +# # gfortan dir in the docker. +# link_directories(/usr/local/gcc-8.2/lib64) +# # if you are working with C and Fortran +# FortranCInterface_VERIFY() + +# # if you are working with C++ and Fortran +# FortranCInterface_VERIFY(CXX) + + +#TODO: switch to CPM +include(GNUInstallDirs) +ExternalProject_Add( + OPENBLAS + GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git + GIT_TAG v0.3.10 + GIT_SHALLOW YES + PREFIX ${OpenBLAS_PREFIX} + SOURCE_DIR ${OpenBLAS_SOURCE_DIR} + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX= + CMAKE_GENERATOR "Unix Makefiles") + + +# https://cmake.org/cmake/help/latest/module/ExternalProject.html?highlight=externalproject_get_property#external-project-definition +ExternalProject_Get_Property(OPENBLAS INSTALL_DIR) +set(OpenBLAS_INSTALL_PREFIX ${INSTALL_DIR}) +add_library(openblas STATIC IMPORTED) +add_dependencies(openblas OPENBLAS) +set_target_properties(openblas PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES Fortran) +# ${CMAKE_INSTALL_LIBDIR} lib +set_target_properties(openblas PROPERTIES IMPORTED_LOCATION ${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/libopenblas.a) + + +# https://cmake.org/cmake/help/latest/command/install.html?highlight=cmake_install_libdir#installing-targets +# ${CMAKE_INSTALL_LIBDIR} lib +# ${CMAKE_INSTALL_INCLUDEDIR} include +link_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}) +include_directories(${OpenBLAS_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}) \ No newline at end of file diff --git a/speechx/cmake/external/openfst.cmake b/speechx/cmake/external/openfst.cmake new file mode 100644 index 000000000..9acf530a1 --- /dev/null +++ b/speechx/cmake/external/openfst.cmake @@ -0,0 +1,20 @@ +include(FetchContent) +set(openfst_PREFIX_DIR ${fc_patch}/openfst) +set(openfst_SOURCE_DIR ${fc_patch}/openfst-src) +set(openfst_BINARY_DIR ${fc_patch}/openfst-build) + +ExternalProject_Add(openfst + URL https://github.com/mjansche/openfst/archive/refs/tags/1.7.2.zip + URL_HASH SHA256=ffc56931025579a8af3515741c0f3b0fc3a854c023421472c07ca0c6389c75e6 + PREFIX ${openfst_PREFIX_DIR} + SOURCE_DIR ${openfst_SOURCE_DIR} + BINARY_DIR ${openfst_BINARY_DIR} + CONFIGURE_COMMAND ${openfst_SOURCE_DIR}/configure --prefix=${openfst_PREFIX_DIR} + "CPPFLAGS=-I${gflags_BINARY_DIR}/include -I${glog_SOURCE_DIR}/src -I${glog_BINARY_DIR}" + "LDFLAGS=-L${gflags_BINARY_DIR} -L${glog_BINARY_DIR}" + "LIBS=-lgflags_nothreads -lglog -lpthread" + COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR} + BUILD_COMMAND make 
-j 4 +) +link_directories(${openfst_PREFIX_DIR}/lib) +include_directories(${openfst_PREFIX_DIR}/include) \ No newline at end of file diff --git a/speechx/examples/.gitignore b/speechx/examples/.gitignore new file mode 100644 index 000000000..b7075fa56 --- /dev/null +++ b/speechx/examples/.gitignore @@ -0,0 +1,2 @@ +*.ark +paddle_asr_model/ diff --git a/speechx/examples/.gitkeep b/speechx/examples/.gitkeep deleted file mode 100644 index e69de29bb..000000000 diff --git a/speechx/examples/CMakeLists.txt b/speechx/examples/CMakeLists.txt new file mode 100644 index 000000000..7f1543c25 --- /dev/null +++ b/speechx/examples/CMakeLists.txt @@ -0,0 +1,7 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +add_subdirectory(feat) +add_subdirectory(nnet) +add_subdirectory(decoder) + +add_subdirectory(glog) \ No newline at end of file diff --git a/speechx/examples/README.md b/speechx/examples/README.md new file mode 100644 index 000000000..705ca2006 --- /dev/null +++ b/speechx/examples/README.md @@ -0,0 +1,17 @@ +# Examples + +* glog - glog usage +* feat - mfcc, linear +* nnet - ds2 nn +* decoder - online decoder to work as offline + +## How to run + +`run.sh` is the entry point. + +Example to play `decoder`: + +``` +pushd decoder +bash run.sh +``` diff --git a/speechx/examples/decoder/CMakeLists.txt b/speechx/examples/decoder/CMakeLists.txt new file mode 100644 index 000000000..ded423e94 --- /dev/null +++ b/speechx/examples/decoder/CMakeLists.txt @@ -0,0 +1,14 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +add_executable(offline_decoder_sliding_chunk_main ${CMAKE_CURRENT_SOURCE_DIR}/offline_decoder_sliding_chunk_main.cc) +target_include_directories(offline_decoder_sliding_chunk_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) +target_link_libraries(offline_decoder_sliding_chunk_main PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) + +add_executable(offline_decoder_main ${CMAKE_CURRENT_SOURCE_DIR}/offline_decoder_main.cc) +target_include_directories(offline_decoder_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) +target_link_libraries(offline_decoder_main PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) + +add_executable(decoder_test_main ${CMAKE_CURRENT_SOURCE_DIR}/decoder_test_main.cc) +target_include_directories(decoder_test_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) +target_link_libraries(decoder_test_main PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS}) + diff --git a/speechx/examples/decoder/decoder_test_main.cc b/speechx/examples/decoder/decoder_test_main.cc new file mode 100644 index 000000000..0e249cc6b --- /dev/null +++ b/speechx/examples/decoder/decoder_test_main.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
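+// Summary of the test below: it reads pre-computed nnet posterior matrices from
+// a Kaldi table (--nnet_prob_respecifier), hands each matrix to a Decodable via
+// Acceptlikelihood(), and runs the CTC beam search decoder per utterance
+// (InitDecoder -> AdvanceDecode -> GetFinalBestPath -> Reset) using the
+// vocabulary (--dict_file) and the KenLM language model (--lm_path, "lm.klm" by
+// default).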
+ +// todo refactor, repalce with gtest + +#include "base/flags.h" +#include "base/log.h" +#include "decoder/ctc_beam_search_decoder.h" +#include "kaldi/util/table-types.h" +#include "nnet/decodable.h" + +DEFINE_string(nnet_prob_respecifier, "", "test nnet prob rspecifier"); +DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm"); +DEFINE_string(lm_path, "lm.klm", "language model"); + +using kaldi::BaseFloat; +using kaldi::Matrix; +using std::vector; + +// test decoder by feeding nnet posterior probability +int main(int argc, char* argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + + kaldi::SequentialBaseFloatMatrixReader likelihood_reader( + FLAGS_nnet_prob_respecifier); + std::string dict_file = FLAGS_dict_file; + std::string lm_path = FLAGS_lm_path; + LOG(INFO) << "dict path: " << dict_file; + LOG(INFO) << "lm path: " << lm_path; + + int32 num_done = 0, num_err = 0; + + ppspeech::CTCBeamSearchOptions opts; + opts.dict_file = dict_file; + opts.lm_path = lm_path; + ppspeech::CTCBeamSearch decoder(opts); + + std::shared_ptr decodable( + new ppspeech::Decodable(nullptr, nullptr)); + + decoder.InitDecoder(); + + for (; !likelihood_reader.Done(); likelihood_reader.Next()) { + string utt = likelihood_reader.Key(); + const kaldi::Matrix likelihood = likelihood_reader.Value(); + LOG(INFO) << "process utt: " << utt; + LOG(INFO) << "rows: " << likelihood.NumRows(); + LOG(INFO) << "cols: " << likelihood.NumCols(); + decodable->Acceptlikelihood(likelihood); + decoder.AdvanceDecode(decodable); + std::string result; + result = decoder.GetFinalBestPath(); + KALDI_LOG << " the result of " << utt << " is " << result; + decodable->Reset(); + decoder.Reset(); + ++num_done; + } + + KALDI_LOG << "Done " << num_done << " utterances, " << num_err + << " with errors."; + return (num_done != 0 ? 0 : 1); +} diff --git a/speechx/examples/decoder/local/model.sh b/speechx/examples/decoder/local/model.sh new file mode 100644 index 000000000..5c609a6cf --- /dev/null +++ b/speechx/examples/decoder/local/model.sh @@ -0,0 +1,3 @@ +#!/bin/bash + + diff --git a/speechx/examples/decoder/offline_decoder_main.cc b/speechx/examples/decoder/offline_decoder_main.cc new file mode 100644 index 000000000..9a9c14a0c --- /dev/null +++ b/speechx/examples/decoder/offline_decoder_main.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
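+// Summary of this (deprecated) test: it reads speech feature matrices from a
+// Kaldi table (--feature_respecifier), wraps a PaddleNnet (--model_path /
+// --param_path) together with a DataCache frontend in a Decodable, feeds each
+// utterance in fixed-size chunks of --chunk_size frames (SetFinished() is called
+// on the last chunk), and decodes with the CTC beam search decoder (--dict_file,
+// --lm_path). Because num_chunks uses integer division, any trailing
+// NumRows() % chunk_size frames are not fed to the decoder.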
+ +// todo refactor, repalce with gtest + +#include "base/flags.h" +#include "base/log.h" +#include "decoder/ctc_beam_search_decoder.h" +#include "frontend/audio/data_cache.h" +#include "kaldi/util/table-types.h" +#include "nnet/decodable.h" +#include "nnet/paddle_nnet.h" + +DEFINE_string(feature_respecifier, "", "feature matrix rspecifier"); +DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); +DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); +DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm"); +DEFINE_string(lm_path, "lm.klm", "language model"); +DEFINE_int32(chunk_size, 35, "feat chunk size"); + + +using kaldi::BaseFloat; +using kaldi::Matrix; +using std::vector; + +// test decoder by feeding speech feature, deprecated. +int main(int argc, char* argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + + kaldi::SequentialBaseFloatMatrixReader feature_reader( + FLAGS_feature_respecifier); + std::string model_graph = FLAGS_model_path; + std::string model_params = FLAGS_param_path; + std::string dict_file = FLAGS_dict_file; + std::string lm_path = FLAGS_lm_path; + int32 chunk_size = FLAGS_chunk_size; + LOG(INFO) << "model path: " << model_graph; + LOG(INFO) << "model param: " << model_params; + LOG(INFO) << "dict path: " << dict_file; + LOG(INFO) << "lm path: " << lm_path; + LOG(INFO) << "chunk size (frame): " << chunk_size; + + int32 num_done = 0, num_err = 0; + + // frontend + nnet is decodable + ppspeech::ModelOptions model_opts; + model_opts.model_path = model_graph; + model_opts.params_path = model_params; + std::shared_ptr nnet( + new ppspeech::PaddleNnet(model_opts)); + std::shared_ptr raw_data(new ppspeech::DataCache()); + std::shared_ptr decodable( + new ppspeech::Decodable(nnet, raw_data)); + LOG(INFO) << "Init decodeable."; + + // init decoder + ppspeech::CTCBeamSearchOptions opts; + opts.dict_file = dict_file; + opts.lm_path = lm_path; + ppspeech::CTCBeamSearch decoder(opts); + LOG(INFO) << "Init decoder."; + + decoder.InitDecoder(); + for (; !feature_reader.Done(); feature_reader.Next()) { + string utt = feature_reader.Key(); + const kaldi::Matrix feature = feature_reader.Value(); + LOG(INFO) << "utt: " << utt; + + // feat dim + raw_data->SetDim(feature.NumCols()); + LOG(INFO) << "dim: " << raw_data->Dim(); + + int32 row_idx = 0; + int32 num_chunks = feature.NumRows() / chunk_size; + LOG(INFO) << "n chunks: " << num_chunks; + for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { + // feat chunk + kaldi::Vector feature_chunk(chunk_size * + feature.NumCols()); + for (int row_id = 0; row_id < chunk_size; ++row_id) { + kaldi::SubVector feat_one_row(feature, + row_idx); + kaldi::SubVector f_chunk_tmp( + feature_chunk.Data() + row_id * feature.NumCols(), + feature.NumCols()); + f_chunk_tmp.CopyFromVec(feat_one_row); + row_idx++; + } + // feed to raw cache + raw_data->Accept(feature_chunk); + if (chunk_idx == num_chunks - 1) { + raw_data->SetFinished(); + } + // decode step + decoder.AdvanceDecode(decodable); + } + + std::string result; + result = decoder.GetFinalBestPath(); + KALDI_LOG << " the result of " << utt << " is " << result; + decodable->Reset(); + decoder.Reset(); + ++num_done; + } + + KALDI_LOG << "Done " << num_done << " utterances, " << num_err + << " with errors."; + return (num_done != 0 ? 
0 : 1); +} diff --git a/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc b/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc new file mode 100644 index 000000000..7f6c572ca --- /dev/null +++ b/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc @@ -0,0 +1,141 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// todo refactor, repalce with gtest + +#include "base/flags.h" +#include "base/log.h" +#include "decoder/ctc_beam_search_decoder.h" +#include "frontend/audio/data_cache.h" +#include "kaldi/util/table-types.h" +#include "nnet/decodable.h" +#include "nnet/paddle_nnet.h" + +DEFINE_string(feature_respecifier, "", "test feature rspecifier"); +DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model"); +DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param"); +DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm"); +DEFINE_string(lm_path, "lm.klm", "language model"); +DEFINE_int32(receptive_field_length, + 7, + "receptive field of two CNN(kernel=5) downsampling module."); +DEFINE_int32(downsampling_rate, + 4, + "two CNN(kernel=5) module downsampling rate."); + +using kaldi::BaseFloat; +using kaldi::Matrix; +using std::vector; + + +// test ds2 online decoder by feeding speech feature +int main(int argc, char* argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + + kaldi::SequentialBaseFloatMatrixReader feature_reader( + FLAGS_feature_respecifier); + std::string model_graph = FLAGS_model_path; + std::string model_params = FLAGS_param_path; + std::string dict_file = FLAGS_dict_file; + std::string lm_path = FLAGS_lm_path; + LOG(INFO) << "model path: " << model_graph; + LOG(INFO) << "model param: " << model_params; + LOG(INFO) << "dict path: " << dict_file; + LOG(INFO) << "lm path: " << lm_path; + + + int32 num_done = 0, num_err = 0; + + ppspeech::CTCBeamSearchOptions opts; + opts.dict_file = dict_file; + opts.lm_path = lm_path; + ppspeech::CTCBeamSearch decoder(opts); + + ppspeech::ModelOptions model_opts; + model_opts.model_path = model_graph; + model_opts.params_path = model_params; + model_opts.cache_shape = "5-1-1024,5-1-1024"; + std::shared_ptr nnet( + new ppspeech::PaddleNnet(model_opts)); + std::shared_ptr raw_data(new ppspeech::DataCache()); + std::shared_ptr decodable( + new ppspeech::Decodable(nnet, raw_data)); + + int32 chunk_size = FLAGS_receptive_field_length; + int32 chunk_stride = FLAGS_downsampling_rate; + int32 receptive_field_length = FLAGS_receptive_field_length; + LOG(INFO) << "chunk size (frame): " << chunk_size; + LOG(INFO) << "chunk stride (frame): " << chunk_stride; + LOG(INFO) << "receptive field (frame): " << receptive_field_length; + decoder.InitDecoder(); + + for (; !feature_reader.Done(); feature_reader.Next()) { + string utt = feature_reader.Key(); + kaldi::Matrix feature = feature_reader.Value(); + raw_data->SetDim(feature.NumCols()); + LOG(INFO) << "process utt: " << 
utt; + LOG(INFO) << "rows: " << feature.NumRows(); + LOG(INFO) << "cols: " << feature.NumCols(); + + int32 row_idx = 0; + int32 padding_len = 0; + int32 ori_feature_len = feature.NumRows(); + if ((feature.NumRows() - chunk_size) % chunk_stride != 0) { + padding_len = + chunk_stride - (feature.NumRows() - chunk_size) % chunk_stride; + feature.Resize(feature.NumRows() + padding_len, + feature.NumCols(), + kaldi::kCopyData); + } + int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1; + for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) { + kaldi::Vector feature_chunk(chunk_size * + feature.NumCols()); + int32 feature_chunk_size = 0; + if (ori_feature_len > chunk_idx * chunk_stride) { + feature_chunk_size = std::min( + ori_feature_len - chunk_idx * chunk_stride, chunk_size); + } + if (feature_chunk_size < receptive_field_length) break; + + int32 start = chunk_idx * chunk_stride; + int32 end = start + chunk_size; + + for (int row_id = 0; row_id < chunk_size; ++row_id) { + kaldi::SubVector tmp(feature, start); + kaldi::SubVector f_chunk_tmp( + feature_chunk.Data() + row_id * feature.NumCols(), + feature.NumCols()); + f_chunk_tmp.CopyFromVec(tmp); + ++start; + } + raw_data->Accept(feature_chunk); + if (chunk_idx == num_chunks - 1) { + raw_data->SetFinished(); + } + decoder.AdvanceDecode(decodable); + } + std::string result; + result = decoder.GetFinalBestPath(); + KALDI_LOG << " the result of " << utt << " is " << result; + decodable->Reset(); + decoder.Reset(); + ++num_done; + } + + KALDI_LOG << "Done " << num_done << " utterances, " << num_err + << " with errors."; + return (num_done != 0 ? 0 : 1); +} diff --git a/speechx/examples/decoder/path.sh b/speechx/examples/decoder/path.sh new file mode 100644 index 000000000..a0e7c9aed --- /dev/null +++ b/speechx/examples/decoder/path.sh @@ -0,0 +1,14 @@ +# This contains the locations of binarys build required for running the examples. + +SPEECHX_ROOT=$PWD/../.. +SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples + +SPEECHX_TOOLS=$SPEECHX_ROOT/tools +TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin + +[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. please ensure that the project build successfully"; } + +export LC_AL=C + +SPEECHX_BIN=$SPEECHX_EXAMPLES/decoder:$SPEECHX_EXAMPLES/feat +export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN diff --git a/speechx/examples/decoder/run.sh b/speechx/examples/decoder/run.sh new file mode 100755 index 000000000..ddda89702 --- /dev/null +++ b/speechx/examples/decoder/run.sh @@ -0,0 +1,43 @@ +#!/bin/bash +set +x +set -e + +. path.sh + +# 1. compile +if [ ! -d ${SPEECHX_EXAMPLES} ]; then + pushd ${SPEECHX_ROOT} + bash build.sh + popd +fi + + +# 2. download model +if [ ! -d ../paddle_asr_model ]; then + wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/paddle_asr_model.tar.gz + tar xzfv paddle_asr_model.tar.gz + mv ./paddle_asr_model ../ + # produce wav scp + echo "utt1 " $PWD/../paddle_asr_model/BAC009S0764W0290.wav > ../paddle_asr_model/wav.scp +fi + +model_dir=../paddle_asr_model +feat_wspecifier=./feats.ark +cmvn=./cmvn.ark + + +export GLOG_logtostderr=1 + +# 3. gen linear feat +linear_spectrogram_main \ + --wav_rspecifier=scp:$model_dir/wav.scp \ + --feature_wspecifier=ark,t:$feat_wspecifier \ + --cmvn_write_path=$cmvn + +# 4. 
run decoder +offline_decoder_main \ + --feature_respecifier=ark:$feat_wspecifier \ + --model_path=$model_dir/avg_1.jit.pdmodel \ + --param_path=$model_dir/avg_1.jit.pdparams \ + --dict_file=$model_dir/vocab.txt \ + --lm_path=$model_dir/avg_1.jit.klm diff --git a/speechx/examples/decoder/valgrind.sh b/speechx/examples/decoder/valgrind.sh new file mode 100755 index 000000000..14efe0ba4 --- /dev/null +++ b/speechx/examples/decoder/valgrind.sh @@ -0,0 +1,26 @@ +#!/bin/bash + +# this script is for memory check, so please run ./run.sh first. + +set +x +set -e + +. ./path.sh + +if [ ! -d ${SPEECHX_TOOLS}/valgrind/install ]; then + echo "please install valgrind in the speechx tools dir.\n" + exit 1 +fi + +model_dir=../paddle_asr_model +feat_wspecifier=./feats.ark +cmvn=./cmvn.ark + +valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \ + offline_decoder_main \ + --feature_respecifier=ark:$feat_wspecifier \ + --model_path=$model_dir/avg_1.jit.pdmodel \ + --param_path=$model_dir/avg_1.jit.pdparams \ + --dict_file=$model_dir/vocab.txt \ + --lm_path=$model_dir/avg_1.jit.klm + diff --git a/speechx/examples/feat/CMakeLists.txt b/speechx/examples/feat/CMakeLists.txt new file mode 100644 index 000000000..b8f516afb --- /dev/null +++ b/speechx/examples/feat/CMakeLists.txt @@ -0,0 +1,10 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + + +add_executable(mfcc-test ${CMAKE_CURRENT_SOURCE_DIR}/feature-mfcc-test.cc) +target_include_directories(mfcc-test PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) +target_link_libraries(mfcc-test kaldi-mfcc) + +add_executable(linear_spectrogram_main ${CMAKE_CURRENT_SOURCE_DIR}/linear_spectrogram_main.cc) +target_include_directories(linear_spectrogram_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi) +target_link_libraries(linear_spectrogram_main frontend kaldi-util kaldi-feat-common gflags glog) \ No newline at end of file diff --git a/speechx/examples/feat/feature-mfcc-test.cc b/speechx/examples/feat/feature-mfcc-test.cc new file mode 100644 index 000000000..48a9e1c29 --- /dev/null +++ b/speechx/examples/feat/feature-mfcc-test.cc @@ -0,0 +1,719 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// feat/feature-mfcc-test.cc + +// Copyright 2009-2011 Karel Vesely; Petr Motlicek + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
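+// This file is adapted from Kaldi's feat/feature-mfcc-test.cc: it reads
+// test_data/test.wav, computes MFCCs plus deltas under several HTK-compatible
+// configurations (UnitTestHTKCompare1..6), and compares them element-wise
+// against reference features produced by HTK (test_data/test.wav.fea_htk.N)
+// with an absolute tolerance of 1.0. UnitTestReadWave checks the wave reader
+// against a MATLAB-exported vector, and UnitTestVtln checks the VTLN
+// frequency-warping function. main() repeats the whole suite five times.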
+ + +#include + +#include "base/kaldi-math.h" +#include "feat/feature-mfcc.h" +#include "feat/wave-reader.h" +#include "matrix/kaldi-matrix-inl.h" + +using namespace kaldi; + +static void UnitTestReadWave() { + std::cout << "=== UnitTestReadWave() ===\n"; + + Vector v, v2; + + std::cout << "<<<=== Reading waveform\n"; + + { + std::ifstream is("test_data/test.wav", std::ios_base::binary); + WaveData wave; + wave.Read(is); + const Matrix data(wave.Data()); + KALDI_ASSERT(data.NumRows() == 1); + v.Resize(data.NumCols()); + v.CopyFromVec(data.Row(0)); + } + + std::cout + << "<<<=== Reading Vector waveform, prepared by matlab\n"; + std::ifstream input("test_data/test_matlab.ascii"); + KALDI_ASSERT(input.good()); + v2.Read(input, false); + input.close(); + + std::cout + << "<<<=== Comparing freshly read waveform to 'libsndfile' waveform\n"; + KALDI_ASSERT(v.Dim() == v2.Dim()); + for (int32 i = 0; i < v.Dim(); i++) { + KALDI_ASSERT(v(i) == v2(i)); + } + std::cout << "<<<=== Comparing done\n"; + + // std::cout << "== The Waveform Samples == \n"; + // std::cout << v; + + std::cout << "Test passed :)\n\n"; +} + + +/** + */ +static void UnitTestSimple() { + std::cout << "=== UnitTestSimple() ===\n"; + + Vector v(100000); + Matrix m; + + // init with noise + for (int32 i = 0; i < v.Dim(); i++) { + v(i) = (abs(i * 433024253) % 65535) - (65535 / 2); + } + + std::cout << "<<<=== Just make sure it runs... Nothing is compared\n"; + // the parametrization object + MfccOptions op; + // trying to have same opts as baseline. + op.frame_opts.dither = 0.0; + op.frame_opts.preemph_coeff = 0.0; + op.frame_opts.window_type = "rectangular"; + op.frame_opts.remove_dc_offset = false; + op.frame_opts.round_to_power_of_two = true; + op.mel_opts.low_freq = 0.0; + op.mel_opts.htk_mode = true; + op.htk_compat = true; + + Mfcc mfcc(op); + // use default parameters + + // compute mfccs. + mfcc.Compute(v, 1.0, &m); + + // possibly dump + // std::cout << "== Output features == \n" << m; + std::cout << "Test passed :)\n\n"; +} + + +static void UnitTestHTKCompare1() { + std::cout << "=== UnitTestHTKCompare1() ===\n"; + + std::ifstream is("test_data/test.wav", std::ios_base::binary); + WaveData wave; + wave.Read(is); + KALDI_ASSERT(wave.Data().NumRows() == 1); + SubVector waveform(wave.Data(), 0); + + // read the HTK features + Matrix htk_features; + { + std::ifstream is("test_data/test.wav.fea_htk.1", + std::ios::in | std::ios_base::binary); + bool ans = ReadHtk(is, &htk_features, 0); + KALDI_ASSERT(ans); + } + + // use mfcc with default configuration... + MfccOptions op; + op.frame_opts.dither = 0.0; + op.frame_opts.preemph_coeff = 0.0; + op.frame_opts.window_type = "hamming"; + op.frame_opts.remove_dc_offset = false; + op.frame_opts.round_to_power_of_two = true; + op.mel_opts.low_freq = 0.0; + op.mel_opts.htk_mode = true; + op.htk_compat = true; + op.use_energy = false; // C0 not energy. + + Mfcc mfcc(op); + + // calculate kaldi features + Matrix kaldi_raw_features; + mfcc.Compute(waveform, 1.0, &kaldi_raw_features); + + DeltaFeaturesOptions delta_opts; + Matrix kaldi_features; + ComputeDeltas(delta_opts, kaldi_raw_features, &kaldi_features); + + // compare the results + bool passed = true; + int32 i_old = -1; + KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); + KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); + // Ignore ends-- we make slightly different choices than + // HTK about how to treat the deltas at the ends. 
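+    // Concretely: the first and last 10 frames are skipped (i runs from 10 to
+    // NumRows() - 11) and every remaining coefficient must match the HTK
+    // reference within an absolute difference of 1.0; a mismatching row is
+    // printed once (guarded by i_old) along with the offending [row, col]
+    // indices, and KALDI_ERR aborts the test if any mismatch was found.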
+ for (int32 i = 10; i + 10 < kaldi_features.NumRows(); i++) { + for (int32 j = 0; j < kaldi_features.NumCols(); j++) { + BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); + if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! + // print the non-matching data only once per-line + if (i_old != i) { + std::cout << "\n\n\n[HTK-row: " << i << "] " + << htk_features.Row(i) << "\n"; + std::cout << "[Kaldi-row: " << i << "] " + << kaldi_features.Row(i) << "\n\n\n"; + i_old = i; + } + // print indices of non-matching cells + std::cout << "[" << i << ", " << j << "]"; + passed = false; + } + } + } + if (!passed) KALDI_ERR << "Test failed"; + + // write the htk features for later inspection + HtkHeader header = { + kaldi_features.NumRows(), + 100000, // 10ms + static_cast(sizeof(float) * kaldi_features.NumCols()), + 021406 // MFCC_D_A_0 + }; + { + std::ofstream os("tmp.test.wav.fea_kaldi.1", + std::ios::out | std::ios::binary); + WriteHtk(os, kaldi_features, header); + } + + std::cout << "Test passed :)\n\n"; + + unlink("tmp.test.wav.fea_kaldi.1"); +} + + +static void UnitTestHTKCompare2() { + std::cout << "=== UnitTestHTKCompare2() ===\n"; + + std::ifstream is("test_data/test.wav", std::ios_base::binary); + WaveData wave; + wave.Read(is); + KALDI_ASSERT(wave.Data().NumRows() == 1); + SubVector waveform(wave.Data(), 0); + + // read the HTK features + Matrix htk_features; + { + std::ifstream is("test_data/test.wav.fea_htk.2", + std::ios::in | std::ios_base::binary); + bool ans = ReadHtk(is, &htk_features, 0); + KALDI_ASSERT(ans); + } + + // use mfcc with default configuration... + MfccOptions op; + op.frame_opts.dither = 0.0; + op.frame_opts.preemph_coeff = 0.0; + op.frame_opts.window_type = "hamming"; + op.frame_opts.remove_dc_offset = false; + op.frame_opts.round_to_power_of_two = true; + op.mel_opts.low_freq = 0.0; + op.mel_opts.htk_mode = true; + op.htk_compat = true; + op.use_energy = true; // Use energy. + + Mfcc mfcc(op); + + // calculate kaldi features + Matrix kaldi_raw_features; + mfcc.Compute(waveform, 1.0, &kaldi_raw_features); + + DeltaFeaturesOptions delta_opts; + Matrix kaldi_features; + ComputeDeltas(delta_opts, kaldi_raw_features, &kaldi_features); + + // compare the results + bool passed = true; + int32 i_old = -1; + KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); + KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); + // Ignore ends-- we make slightly different choices than + // HTK about how to treat the deltas at the ends. + for (int32 i = 10; i + 10 < kaldi_features.NumRows(); i++) { + for (int32 j = 0; j < kaldi_features.NumCols(); j++) { + BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); + if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! 
+ // print the non-matching data only once per-line + if (i_old != i) { + std::cout << "\n\n\n[HTK-row: " << i << "] " + << htk_features.Row(i) << "\n"; + std::cout << "[Kaldi-row: " << i << "] " + << kaldi_features.Row(i) << "\n\n\n"; + i_old = i; + } + // print indices of non-matching cells + std::cout << "[" << i << ", " << j << "]"; + passed = false; + } + } + } + if (!passed) KALDI_ERR << "Test failed"; + + // write the htk features for later inspection + HtkHeader header = { + kaldi_features.NumRows(), + 100000, // 10ms + static_cast(sizeof(float) * kaldi_features.NumCols()), + 021406 // MFCC_D_A_0 + }; + { + std::ofstream os("tmp.test.wav.fea_kaldi.2", + std::ios::out | std::ios::binary); + WriteHtk(os, kaldi_features, header); + } + + std::cout << "Test passed :)\n\n"; + + unlink("tmp.test.wav.fea_kaldi.2"); +} + + +static void UnitTestHTKCompare3() { + std::cout << "=== UnitTestHTKCompare3() ===\n"; + + std::ifstream is("test_data/test.wav", std::ios_base::binary); + WaveData wave; + wave.Read(is); + KALDI_ASSERT(wave.Data().NumRows() == 1); + SubVector waveform(wave.Data(), 0); + + // read the HTK features + Matrix htk_features; + { + std::ifstream is("test_data/test.wav.fea_htk.3", + std::ios::in | std::ios_base::binary); + bool ans = ReadHtk(is, &htk_features, 0); + KALDI_ASSERT(ans); + } + + // use mfcc with default configuration... + MfccOptions op; + op.frame_opts.dither = 0.0; + op.frame_opts.preemph_coeff = 0.0; + op.frame_opts.window_type = "hamming"; + op.frame_opts.remove_dc_offset = false; + op.frame_opts.round_to_power_of_two = true; + op.htk_compat = true; + op.use_energy = true; // Use energy. + op.mel_opts.low_freq = 20.0; + // op.mel_opts.debug_mel = true; + op.mel_opts.htk_mode = true; + + Mfcc mfcc(op); + + // calculate kaldi features + Matrix kaldi_raw_features; + mfcc.Compute(waveform, 1.0, &kaldi_raw_features); + + DeltaFeaturesOptions delta_opts; + Matrix kaldi_features; + ComputeDeltas(delta_opts, kaldi_raw_features, &kaldi_features); + + // compare the results + bool passed = true; + int32 i_old = -1; + KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); + KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); + // Ignore ends-- we make slightly different choices than + // HTK about how to treat the deltas at the ends. + for (int32 i = 10; i + 10 < kaldi_features.NumRows(); i++) { + for (int32 j = 0; j < kaldi_features.NumCols(); j++) { + BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); + if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! 
+ // print the non-matching data only once per-line + if (static_cast(i_old) != i) { + std::cout << "\n\n\n[HTK-row: " << i << "] " + << htk_features.Row(i) << "\n"; + std::cout << "[Kaldi-row: " << i << "] " + << kaldi_features.Row(i) << "\n\n\n"; + i_old = i; + } + // print indices of non-matching cells + std::cout << "[" << i << ", " << j << "]"; + passed = false; + } + } + } + if (!passed) KALDI_ERR << "Test failed"; + + // write the htk features for later inspection + HtkHeader header = { + kaldi_features.NumRows(), + 100000, // 10ms + static_cast(sizeof(float) * kaldi_features.NumCols()), + 021406 // MFCC_D_A_0 + }; + { + std::ofstream os("tmp.test.wav.fea_kaldi.3", + std::ios::out | std::ios::binary); + WriteHtk(os, kaldi_features, header); + } + + std::cout << "Test passed :)\n\n"; + + unlink("tmp.test.wav.fea_kaldi.3"); +} + + +static void UnitTestHTKCompare4() { + std::cout << "=== UnitTestHTKCompare4() ===\n"; + + std::ifstream is("test_data/test.wav", std::ios_base::binary); + WaveData wave; + wave.Read(is); + KALDI_ASSERT(wave.Data().NumRows() == 1); + SubVector waveform(wave.Data(), 0); + + // read the HTK features + Matrix htk_features; + { + std::ifstream is("test_data/test.wav.fea_htk.4", + std::ios::in | std::ios_base::binary); + bool ans = ReadHtk(is, &htk_features, 0); + KALDI_ASSERT(ans); + } + + // use mfcc with default configuration... + MfccOptions op; + op.frame_opts.dither = 0.0; + op.frame_opts.window_type = "hamming"; + op.frame_opts.remove_dc_offset = false; + op.frame_opts.round_to_power_of_two = true; + op.mel_opts.low_freq = 0.0; + op.htk_compat = true; + op.use_energy = true; // Use energy. + op.mel_opts.htk_mode = true; + + Mfcc mfcc(op); + + // calculate kaldi features + Matrix kaldi_raw_features; + mfcc.Compute(waveform, 1.0, &kaldi_raw_features); + + DeltaFeaturesOptions delta_opts; + Matrix kaldi_features; + ComputeDeltas(delta_opts, kaldi_raw_features, &kaldi_features); + + // compare the results + bool passed = true; + int32 i_old = -1; + KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); + KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); + // Ignore ends-- we make slightly different choices than + // HTK about how to treat the deltas at the ends. + for (int32 i = 10; i + 10 < kaldi_features.NumRows(); i++) { + for (int32 j = 0; j < kaldi_features.NumCols(); j++) { + BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); + if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! 
+ // print the non-matching data only once per-line + if (static_cast(i_old) != i) { + std::cout << "\n\n\n[HTK-row: " << i << "] " + << htk_features.Row(i) << "\n"; + std::cout << "[Kaldi-row: " << i << "] " + << kaldi_features.Row(i) << "\n\n\n"; + i_old = i; + } + // print indices of non-matching cells + std::cout << "[" << i << ", " << j << "]"; + passed = false; + } + } + } + if (!passed) KALDI_ERR << "Test failed"; + + // write the htk features for later inspection + HtkHeader header = { + kaldi_features.NumRows(), + 100000, // 10ms + static_cast(sizeof(float) * kaldi_features.NumCols()), + 021406 // MFCC_D_A_0 + }; + { + std::ofstream os("tmp.test.wav.fea_kaldi.4", + std::ios::out | std::ios::binary); + WriteHtk(os, kaldi_features, header); + } + + std::cout << "Test passed :)\n\n"; + + unlink("tmp.test.wav.fea_kaldi.4"); +} + + +static void UnitTestHTKCompare5() { + std::cout << "=== UnitTestHTKCompare5() ===\n"; + + std::ifstream is("test_data/test.wav", std::ios_base::binary); + WaveData wave; + wave.Read(is); + KALDI_ASSERT(wave.Data().NumRows() == 1); + SubVector waveform(wave.Data(), 0); + + // read the HTK features + Matrix htk_features; + { + std::ifstream is("test_data/test.wav.fea_htk.5", + std::ios::in | std::ios_base::binary); + bool ans = ReadHtk(is, &htk_features, 0); + KALDI_ASSERT(ans); + } + + // use mfcc with default configuration... + MfccOptions op; + op.frame_opts.dither = 0.0; + op.frame_opts.window_type = "hamming"; + op.frame_opts.remove_dc_offset = false; + op.frame_opts.round_to_power_of_two = true; + op.htk_compat = true; + op.use_energy = true; // Use energy. + op.mel_opts.low_freq = 0.0; + op.mel_opts.vtln_low = 100.0; + op.mel_opts.vtln_high = 7500.0; + op.mel_opts.htk_mode = true; + + BaseFloat vtln_warp = + 1.1; // our approach identical to htk for warp factor >1, + // differs slightly for higher mel bins if warp_factor <0.9 + + Mfcc mfcc(op); + + // calculate kaldi features + Matrix kaldi_raw_features; + mfcc.Compute(waveform, vtln_warp, &kaldi_raw_features); + + DeltaFeaturesOptions delta_opts; + Matrix kaldi_features; + ComputeDeltas(delta_opts, kaldi_raw_features, &kaldi_features); + + // compare the results + bool passed = true; + int32 i_old = -1; + KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); + KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); + // Ignore ends-- we make slightly different choices than + // HTK about how to treat the deltas at the ends. + for (int32 i = 10; i + 10 < kaldi_features.NumRows(); i++) { + for (int32 j = 0; j < kaldi_features.NumCols(); j++) { + BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); + if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! 
+ // print the non-matching data only once per-line + if (static_cast(i_old) != i) { + std::cout << "\n\n\n[HTK-row: " << i << "] " + << htk_features.Row(i) << "\n"; + std::cout << "[Kaldi-row: " << i << "] " + << kaldi_features.Row(i) << "\n\n\n"; + i_old = i; + } + // print indices of non-matching cells + std::cout << "[" << i << ", " << j << "]"; + passed = false; + } + } + } + if (!passed) KALDI_ERR << "Test failed"; + + // write the htk features for later inspection + HtkHeader header = { + kaldi_features.NumRows(), + 100000, // 10ms + static_cast(sizeof(float) * kaldi_features.NumCols()), + 021406 // MFCC_D_A_0 + }; + { + std::ofstream os("tmp.test.wav.fea_kaldi.5", + std::ios::out | std::ios::binary); + WriteHtk(os, kaldi_features, header); + } + + std::cout << "Test passed :)\n\n"; + + unlink("tmp.test.wav.fea_kaldi.5"); +} + +static void UnitTestHTKCompare6() { + std::cout << "=== UnitTestHTKCompare6() ===\n"; + + + std::ifstream is("test_data/test.wav", std::ios_base::binary); + WaveData wave; + wave.Read(is); + KALDI_ASSERT(wave.Data().NumRows() == 1); + SubVector waveform(wave.Data(), 0); + + // read the HTK features + Matrix htk_features; + { + std::ifstream is("test_data/test.wav.fea_htk.6", + std::ios::in | std::ios_base::binary); + bool ans = ReadHtk(is, &htk_features, 0); + KALDI_ASSERT(ans); + } + + // use mfcc with default configuration... + MfccOptions op; + op.frame_opts.dither = 0.0; + op.frame_opts.preemph_coeff = 0.97; + op.frame_opts.window_type = "hamming"; + op.frame_opts.remove_dc_offset = false; + op.frame_opts.round_to_power_of_two = true; + op.mel_opts.num_bins = 24; + op.mel_opts.low_freq = 125.0; + op.mel_opts.high_freq = 7800.0; + op.htk_compat = true; + op.use_energy = false; // C0 not energy. + + Mfcc mfcc(op); + + // calculate kaldi features + Matrix kaldi_raw_features; + mfcc.Compute(waveform, 1.0, &kaldi_raw_features); + + DeltaFeaturesOptions delta_opts; + Matrix kaldi_features; + ComputeDeltas(delta_opts, kaldi_raw_features, &kaldi_features); + + // compare the results + bool passed = true; + int32 i_old = -1; + KALDI_ASSERT(kaldi_features.NumRows() == htk_features.NumRows()); + KALDI_ASSERT(kaldi_features.NumCols() == htk_features.NumCols()); + // Ignore ends-- we make slightly different choices than + // HTK about how to treat the deltas at the ends. + for (int32 i = 10; i + 10 < kaldi_features.NumRows(); i++) { + for (int32 j = 0; j < kaldi_features.NumCols(); j++) { + BaseFloat a = kaldi_features(i, j), b = htk_features(i, j); + if ((std::abs(b - a)) > 1.0) { //<< TOLERANCE TO DIFFERENCES!!!!! + // print the non-matching data only once per-line + if (static_cast(i_old) != i) { + std::cout << "\n\n\n[HTK-row: " << i << "] " + << htk_features.Row(i) << "\n"; + std::cout << "[Kaldi-row: " << i << "] " + << kaldi_features.Row(i) << "\n\n\n"; + i_old = i; + } + // print indices of non-matching cells + std::cout << "[" << i << ", " << j << "]"; + passed = false; + } + } + } + if (!passed) KALDI_ERR << "Test failed"; + + // write the htk features for later inspection + HtkHeader header = { + kaldi_features.NumRows(), + 100000, // 10ms + static_cast(sizeof(float) * kaldi_features.NumCols()), + 021406 // MFCC_D_A_0 + }; + { + std::ofstream os("tmp.test.wav.fea_kaldi.6", + std::ios::out | std::ios::binary); + WriteHtk(os, kaldi_features, header); + } + + std::cout << "Test passed :)\n\n"; + + unlink("tmp.test.wav.fea_kaldi.6"); +} + +void UnitTestVtln() { + // Test the function VtlnWarpFreq. 
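+    // For random warp factors in [0.9, 1.1] the loop below checks that
+    // VtlnWarpFreq:
+    //  - maps a mid-band frequency (5000 Hz) to freq / warp_factor,
+    //  - leaves low_freq and high_freq themselves unwarped,
+    //  - is monotonically increasing (w3 >= w2 whenever freq3 >= freq2), and
+    //  - reduces to the identity map when the warp factor is 1.0.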
+ BaseFloat low_freq = 10, high_freq = 7800, vtln_low_cutoff = 20, + vtln_high_cutoff = 7400; + + for (size_t i = 0; i < 100; i++) { + BaseFloat freq = 5000, warp_factor = 0.9 + RandUniform() * 0.2; + AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, + vtln_high_cutoff, + low_freq, + high_freq, + warp_factor, + freq), + freq / warp_factor); + + AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, + vtln_high_cutoff, + low_freq, + high_freq, + warp_factor, + low_freq), + low_freq); + AssertEqual(MelBanks::VtlnWarpFreq(vtln_low_cutoff, + vtln_high_cutoff, + low_freq, + high_freq, + warp_factor, + high_freq), + high_freq); + BaseFloat freq2 = low_freq + (high_freq - low_freq) * RandUniform(), + freq3 = freq2 + + (high_freq - freq2) * RandUniform(); // freq3>=freq2 + BaseFloat w2 = MelBanks::VtlnWarpFreq(vtln_low_cutoff, + vtln_high_cutoff, + low_freq, + high_freq, + warp_factor, + freq2); + BaseFloat w3 = MelBanks::VtlnWarpFreq(vtln_low_cutoff, + vtln_high_cutoff, + low_freq, + high_freq, + warp_factor, + freq3); + KALDI_ASSERT(w3 >= w2); // increasing function. + BaseFloat w3dash = MelBanks::VtlnWarpFreq( + vtln_low_cutoff, vtln_high_cutoff, low_freq, high_freq, 1.0, freq3); + AssertEqual(w3dash, freq3); + } +} + +static void UnitTestFeat() { + UnitTestVtln(); + UnitTestReadWave(); + UnitTestSimple(); + UnitTestHTKCompare1(); + UnitTestHTKCompare2(); + // commenting out this one as it doesn't compare right now I normalized + // the way the FFT bins are treated (removed offset of 0.5)... this seems + // to relate to the way frequency zero behaves. + UnitTestHTKCompare3(); + UnitTestHTKCompare4(); + UnitTestHTKCompare5(); + UnitTestHTKCompare6(); + std::cout << "Tests succeeded.\n"; +} + + +int main() { + try { + for (int i = 0; i < 5; i++) UnitTestFeat(); + std::cout << "Tests succeeded.\n"; + return 0; + } catch (const std::exception &e) { + std::cerr << e.what(); + return 1; + } +} diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc new file mode 100644 index 000000000..2d75bb5df --- /dev/null +++ b/speechx/examples/feat/linear_spectrogram_main.cc @@ -0,0 +1,267 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
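+// Overview: this tool builds the streaming feature pipeline
+// AudioCache -> DecibelNormalizer -> LinearSpectrogram -> CMVN -> FeatureCache.
+// It first writes the hard-coded global CMVN statistics below (mean_, variance_,
+// count_) to --cmvn_write_path, then pushes each wav from --wav_rspecifier
+// through the pipeline in streaming chunks of 0.36 s (5760 samples at 16 kHz)
+// and writes the resulting feature matrices to --feature_wspecifier.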
+ +// todo refactor, repalce with gtest + +#include "base/flags.h" +#include "base/log.h" +#include "kaldi/feat/wave-reader.h" +#include "kaldi/util/kaldi-io.h" +#include "kaldi/util/table-types.h" + +#include "frontend/audio/audio_cache.h" +#include "frontend/audio/data_cache.h" +#include "frontend/audio/feature_cache.h" +#include "frontend/audio/frontend_itf.h" +#include "frontend/audio/linear_spectrogram.h" +#include "frontend/audio/normalizer.h" + +DEFINE_string(wav_rspecifier, "", "test wav scp path"); +DEFINE_string(feature_wspecifier, "", "output feats wspecifier"); +DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn"); + + +std::vector mean_{ + -13730251.531853663, -12982852.199316509, -13673844.299583456, + -13089406.559646806, -12673095.524938712, -12823859.223276224, + -13590267.158903603, -14257618.467152044, -14374605.116185192, + -14490009.21822485, -14849827.158924166, -15354435.470563512, + -15834149.206532761, -16172971.985514281, -16348740.496746974, + -16423536.699409386, -16556246.263649225, -16744088.772748645, + -16916184.08510357, -17054034.840031497, -17165612.509455364, + -17255955.470915023, -17322572.527648456, -17408943.862033736, + -17521554.799865916, -17620623.254924215, -17699792.395918526, + -17723364.411134344, -17741483.4433254, -17747426.888704527, + -17733315.928209435, -17748780.160905756, -17808336.883775543, + -17895918.671983004, -18009812.59173023, -18098188.66548325, + -18195798.958462656, -18293617.62980999, -18397432.92077201, + -18505834.787318766, -18585451.8100908, -18652438.235649142, + -18700960.306275308, -18734944.58792185, -18737426.313365128, + -18735347.165987637, -18738813.444170244, -18737086.848890636, + -18731576.2474336, -18717405.44095871, -18703089.25545657, + -18691014.546456724, -18692460.568905357, -18702119.628629155, + -18727710.621126678, -18761582.72034647, -18806745.835547544, + -18850674.8692112, -18884431.510951452, -18919999.992506847, + -18939303.799078144, -18952946.273760635, -18980289.22996379, + -19011610.17803294, -19040948.61805145, -19061021.429847397, + -19112055.53768819, -19149667.414264943, -19201127.05091321, + -19270250.82564605, -19334606.883057203, -19390513.336589377, + -19444176.259208687, -19502755.000038862, -19544333.014549147, + -19612668.183176614, -19681902.19006569, -19771969.951249883, + -19873329.723376893, -19996752.59235844, -20110031.131400537, + -20231658.612529557, -20319378.894054495, -20378534.45718066, + -20413332.089584175, -20438147.844177883, -20443710.248040095, + -20465457.02238927, -20488610.969337028, -20516295.16424432, + -20541423.795738827, -20553192.874953747, -20573605.50701977, + -20577871.61936797, -20571807.008916274, -20556242.38912231, + -20542199.30819195, -20521239.063551214, -20519150.80004532, + -20527204.80248933, -20536933.769257784, -20543470.522332076, + -20549700.089992985, -20551525.24958494, -20554873.406493705, + -20564277.65794227, -20572211.740052115, -20574305.69550465, + -20575494.450104576, -20567092.577932164, -20549302.929608088, + -20545445.11878376, -20546625.326603737, -20549190.03499401, + -20554824.947828256, -20568341.378989458, -20577582.331383612, + -20577980.519402675, -20566603.03458152, -20560131.592262644, + -20552166.469060015, -20549063.06763577, -20544490.562339947, + -20539817.82346569, -20528747.715731595, -20518026.24576161, + -20510977.844974525, -20506874.36087992, -20506731.11977665, + -20510482.133420516, -20507760.92101862, -20494644.834457114, + -20480107.89304893, -20461312.091867123, -20442941.75080173, + 
-20426123.02834838, -20424607.675283, -20426810.369107097, + -20434024.50097819, -20437404.75544205, -20447688.63916367, + -20460893.335563846, -20482922.735127095, -20503610.119434915, + -20527062.76448319, -20557830.035128627, -20593274.72068722, + -20632528.452965066, -20673637.471334763, -20733106.97143075, + -20842921.0447562, -21054357.83621519, -21416569.534189366, + -21978460.272811692, -22753170.052172784, -23671344.10563395, + -24613499.293358143, -25406477.12230188, -25884377.82156489, + -26049040.62791664, -26996879.104431007}; +std::vector variance_{ + 213747175.10846674, 188395815.34302503, 212706429.10966414, + 199109025.81461075, 189235901.23864496, 194901336.53253657, + 217481594.29306737, 238689869.12327808, 243977501.24115244, + 248479623.6431067, 259766741.47116545, 275516766.7790273, + 291271202.3691234, 302693239.8220509, 308627358.3997694, + 311143911.38788426, 315446105.07731867, 321705430.9341829, + 327458907.4659941, 332245072.43223983, 336251717.5935284, + 339694069.7639722, 342188204.4322228, 345587110.31313115, + 349903086.2875232, 353660214.20643026, 356700344.5270885, + 357665362.3529641, 358493352.05658793, 358857951.620328, + 358375239.52774596, 358899733.6342954, 361051818.3511561, + 364361716.05025816, 368750322.3771452, 372047800.6462831, + 375655861.1349018, 379358519.1980013, 383327605.3935181, + 387458599.282341, 390434692.3406868, 392994486.35057056, + 394874418.04603153, 396230525.79763395, 396365592.0414835, + 396334819.8242737, 396488353.19250053, 396438877.00744957, + 396197980.4459586, 395590921.6672991, 395001107.62072515, + 394528291.7318225, 394593110.424006, 395018405.59353715, + 396110577.5415993, 397506704.0371068, 399400197.4657644, + 401243568.2468382, 402687134.7805103, 404136047.2872507, + 404883170.001883, 405522253.219517, 406660365.3626476, + 407919346.0991902, 409045348.5384909, 409759588.7889818, + 411974821.8564483, 413489718.78201455, 415535392.56684107, + 418466481.97674364, 421104678.35678065, 423405392.5200779, + 425550570.40798235, 427929423.9579701, 429585274.253478, + 432368493.55181056, 435193587.13513297, 438886855.20476013, + 443058876.8633751, 448181232.5093362, 452883835.6332396, + 458056721.77926534, 461816531.22735566, 464363620.1970998, + 465886343.5057493, 466928872.0651, 467180536.42647296, + 468111848.70714295, 469138695.3071312, 470378429.6930793, + 471517958.7132626, 472109050.4262365, 473087417.0177867, + 473381322.04648733, 473220195.85483915, 472666071.8998819, + 472124669.87879956, 471298571.411737, 471251033.2902761, + 471672676.43128747, 472177147.2193172, 472572361.7711908, + 472968783.7751127, 473156295.4164052, 473398034.82676554, + 473897703.5203811, 474328271.33112127, 474452670.98002136, + 474549003.99284613, 474252887.13567275, 473557462.909069, + 473483385.85193115, 473609738.04855174, 473746944.82085115, + 474016729.91696435, 474617321.94138587, 475045097.237122, + 475125402.586558, 474664112.9824912, 474426247.5800283, + 474104075.42796475, 473978219.7273978, 473773171.7798875, + 473578534.69508696, 473102924.16904145, 472651240.5232615, + 472374383.1810912, 472209479.6956096, 472202298.8921673, + 472370090.76781124, 472220933.99374026, 471625467.37106377, + 470994646.51883453, 470182428.9637543, 469348211.5939578, + 468570387.4467277, 468540442.7225135, 468672018.90414184, + 468994346.9533251, 469138757.58201426, 469553915.95710236, + 470134523.38582784, 471082421.62055486, 471962316.51804745, + 472939745.1708408, 474250621.5944825, 475773933.43199486, + 477465399.71087736, 479218782.61382693, 
481752299.7930922, + 486608947.8984568, 496119403.2067917, 512730085.5704984, + 539048915.2641417, 576285298.3548826, 621610270.2240586, + 669308196.4436442, 710656993.5957186, 736344437.3725077, + 745481288.0241544, 801121432.9925804}; +int count_ = 912592; + +void WriteMatrix() { + kaldi::Matrix cmvn_stats(2, mean_.size() + 1); + for (size_t idx = 0; idx < mean_.size(); ++idx) { + cmvn_stats(0, idx) = mean_[idx]; + cmvn_stats(1, idx) = variance_[idx]; + } + cmvn_stats(0, mean_.size()) = count_; + kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, false); +} + +int main(int argc, char* argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, false); + google::InitGoogleLogging(argv[0]); + + kaldi::SequentialTableReader wav_reader( + FLAGS_wav_rspecifier); + kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier); + WriteMatrix(); + + + int32 num_done = 0, num_err = 0; + + // feature pipeline: wave cache --> decibel_normalizer --> hanning + // window -->linear_spectrogram --> global cmvn -> feat cache + + // std::unique_ptr data_source(new + // ppspeech::DataCache()); + std::unique_ptr data_source( + new ppspeech::AudioCache()); + + ppspeech::DecibelNormalizerOptions db_norm_opt; + std::unique_ptr db_norm( + new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source))); + + ppspeech::LinearSpectrogramOptions opt; + opt.frame_opts.frame_length_ms = 20; + opt.frame_opts.frame_shift_ms = 10; + opt.frame_opts.dither = 0.0; + opt.frame_opts.remove_dc_offset = false; + opt.frame_opts.window_type = "hanning"; + opt.frame_opts.preemph_coeff = 0.0; + LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms; + LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms; + + std::unique_ptr linear_spectrogram( + new ppspeech::LinearSpectrogram(opt, std::move(db_norm))); + + std::unique_ptr cmvn(new ppspeech::CMVN( + FLAGS_cmvn_write_path, std::move(linear_spectrogram))); + + ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn)); + LOG(INFO) << "feat dim: " << feature_cache.Dim(); + + int sample_rate = 16000; + float streaming_chunk = 0.36; + int chunk_sample_size = streaming_chunk * sample_rate; + LOG(INFO) << "sr: " << sample_rate; + LOG(INFO) << "chunk size (s): " << streaming_chunk; + LOG(INFO) << "chunk size (sample): " << chunk_sample_size; + + + for (; !wav_reader.Done(); wav_reader.Next()) { + std::string utt = wav_reader.Key(); + const kaldi::WaveData& wave_data = wav_reader.Value(); + LOG(INFO) << "process utt: " << utt; + + int32 this_channel = 0; + kaldi::SubVector waveform(wave_data.Data(), + this_channel); + int tot_samples = waveform.Dim(); + LOG(INFO) << "wav len (sample): " << tot_samples; + + int sample_offset = 0; + std::vector> feats; + int feature_rows = 0; + while (sample_offset < tot_samples) { + int cur_chunk_size = + std::min(chunk_sample_size, tot_samples - sample_offset); + + kaldi::Vector wav_chunk(cur_chunk_size); + for (int i = 0; i < cur_chunk_size; ++i) { + wav_chunk(i) = waveform(sample_offset + i); + } + + kaldi::Vector features; + feature_cache.Accept(wav_chunk); + if (cur_chunk_size < chunk_sample_size) { + feature_cache.SetFinished(); + } + feature_cache.Read(&features); + if (features.Dim() == 0) break; + + feats.push_back(features); + sample_offset += cur_chunk_size; + feature_rows += features.Dim() / feature_cache.Dim(); + } + + int cur_idx = 0; + kaldi::Matrix features(feature_rows, + feature_cache.Dim()); + for (auto feat : feats) { + int num_rows = feat.Dim() / feature_cache.Dim(); + for (int row_idx = 
diff --git a/speechx/examples/feat/valgrind.sh b/speechx/examples/feat/valgrind.sh
new file mode 100755
index 000000000..f8aab63f8
--- /dev/null
+++ b/speechx/examples/feat/valgrind.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+# This script is for memory checking, so please run ./run.sh first.
+
+set +x
+set -e
+
+. ./path.sh
+
+if [ ! -d ${SPEECHX_TOOLS}/valgrind/install ]; then
+    echo "please install valgrind in the speechx tools dir."
+    exit 1
+fi
+
+model_dir=../paddle_asr_model
+feat_wspecifier=./feats.ark
+cmvn=./cmvn.ark
+
+valgrind --tool=memcheck --track-origins=yes --leak-check=full --show-leak-kinds=all \
+    linear_spectrogram_main \
+    --wav_rspecifier=scp:$model_dir/wav.scp \
+    --feature_wspecifier=ark,t:$feat_wspecifier \
+    --cmvn_write_path=$cmvn
+
diff --git a/speechx/examples/glog/CMakeLists.txt b/speechx/examples/glog/CMakeLists.txt
new file mode 100644
index 000000000..b4b0e6358
--- /dev/null
+++ b/speechx/examples/glog/CMakeLists.txt
@@ -0,0 +1,8 @@
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+add_executable(glog_test ${CMAKE_CURRENT_SOURCE_DIR}/glog_test.cc)
+target_link_libraries(glog_test glog)
+
+
+add_executable(glog_logtostderr_test ${CMAKE_CURRENT_SOURCE_DIR}/glog_logtostderr_test.cc)
+target_link_libraries(glog_logtostderr_test glog)
\ No newline at end of file
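The two targets above build small test programs against glog. As a minimal sketch of such a program (an assumption on my part, not necessarily the exact contents of `glog_test.cc` or `glog_logtostderr_test.cc`), the following initializes logging and redirects it to stderr, which is the same effect `run.sh` obtains by exporting `GLOG_logtostderr=1`:

```cpp
// Sketch only: minimal glog usage; not necessarily the exact content of
// glog_test.cc or glog_logtostderr_test.cc.
#include <glog/logging.h>

int main(int argc, char* argv[]) {
    (void)argc;
    google::InitGoogleLogging(argv[0]);  // log file names derive from argv[0]
    FLAGS_logtostderr = 1;  // same effect as exporting GLOG_logtostderr=1
    LOG(INFO) << "hello glog";
    LOG(WARNING) << "warnings and above also show up on stderr";
    return 0;
}
```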
diff --git a/speechx/examples/glog/README.md b/speechx/examples/glog/README.md
new file mode 100644
index 000000000..996e192e9
--- /dev/null
+++ b/speechx/examples/glog/README.md
@@ -0,0 +1,25 @@
+# [GLOG](https://rpg.ifi.uzh.ch/docs/glog.html)
+
+Unless otherwise specified, glog writes to the filename `/tmp/...log...