Merge branch 'develop' into amp

zxcd 1 year ago committed by GitHub
commit bc365cbb52

@ -27,4 +27,4 @@ git commit -m "xxxxxx, test=doc"
1. Even though CI is skipped, the job still has to wait in the queue before it can be skipped, so don't worry if pipelines outside your own area show as pending 🤣
2. Adding `test=xxx` only at `git commit --amend` time may not take effect
3. If a PR has multiple commits, remember to add `test=xxx` to every commit message, because every commit triggers CI
4. Remove any paddlespeech already installed in your Python environment, otherwise it may affect the import order of paddlespeech
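For example, a docs-only PR with two commits could tag both of them (an illustrative sketch; the commit messages are placeholders):

```bash
# every commit in the PR carries the CI tag
git commit -m "update readme, test=doc"
git commit -m "fix broken links, test=doc"
```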

@ -136,7 +136,7 @@ pull_request_rules:
add: ["Docker"]
- name: "auto add label=Deployment"
conditions:
- files~=^speechx/
- files~=^runtime/
actions:
label:
add: ["Deployment"]

@ -3,8 +3,12 @@ repos:
rev: v0.16.0
hooks:
- id: yapf
files: \.py$
exclude: (?=third_party).*(\.py)$
name: yapf
language: python
entry: yapf
args: [-i, -vv]
types: [python]
exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: a11d9314b22d8f8c7556443875b731ef05965464
@ -31,7 +35,7 @@ repos:
- --ignore=E501,E228,E226,E261,E266,E128,E402,W503
- --builtins=G,request
- --jobs=1
exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|audio/paddleaudio/third_party|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
- repo : https://github.com/Lucas-C/pre-commit-hooks
rev: v1.0.1
@ -53,16 +57,16 @@ repos:
entry: bash .pre-commit-hooks/clang-format.hook -i
language: system
files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|audio/paddleaudio/third_party/kaldi-native-fbank/csrc|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|runtime/patch|runtime/tools/fstbin|runtime/tools/lmbin|third_party/ctc_decoders|runtime/engine/common/utils).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
- id: cpplint
name: cpplint
description: Static code analysis of C/C++ files
language: python
files: \.(h\+\+|h|hh|hxx|hpp|cuh|c|cc|cpp|cu|c\+\+|cxx|tpp|txx)$
exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|audio/paddleaudio/third_party/kaldi-native-fbank/csrc|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
exclude: (?=runtime/engine/kaldi|runtime/engine/common/matrix|audio/paddleaudio/src|runtime/patch|runtime/tools/fstbin|runtime/tools/lmbin|third_party/ctc_decoders|runtime/engine/common/utils).*(\.cpp|\.cc|\.h|\.hpp|\.py)$
entry: cpplint --filter=-build,-whitespace,+whitespace/comma,-whitespace/indent
- repo: https://github.com/asottile/reorder_python_imports
rev: v2.4.0
hooks:
- id: reorder-python-imports
exclude: (?=speechx/speechx/kaldi|audio/paddleaudio/src|speechx/patch|speechx/tools/fstbin|speechx/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h\.hpp|\.py)$
exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|runtime/patch|runtime/tools/fstbin|runtime/tools/lmbin|third_party/ctc_decoders).*(\.cpp|\.cc|\.h\.hpp|\.py)$

@ -179,6 +179,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
### Recent Update
- 👑 2023.04.25: Add [AMP for U2 conformer](https://github.com/PaddlePaddle/PaddleSpeech/pull/3167).
- 🔥 2023.04.06: Add [subtitle file (.srt format) generation example](./demos/streaming_asr_server).
- 🔥 2023.03.14: Add SVS (Singing Voice Synthesis) examples with the Opencpop dataset, including [DiffSinger](./examples/opencpop/svs1), [PWGAN](./examples/opencpop/voc1) and [HiFiGAN](./examples/opencpop/voc5); synthesis quality is still being improved.
- 👑 2023.03.09: Add [Wav2vec2ASR-zh](./examples/aishell/asr3).
- 🎉 2023.03.07: Add [TTS ARM Linux C++ Demo (with C++ Chinese Text Frontend)](./demos/TTSArmLinux).
@ -193,7 +194,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
- 👑 2022.11.18: Add [Whisper CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/pull/2640), support multi language recognition and translation.
- 🔥 2022.11.18: Add [Wav2vec2 CLI and Demos](./demos/speech_ssl), Support ASR and Feature Extraction.
- 🎉 2022.11.17: Add [male voice for TTS](https://github.com/PaddlePaddle/PaddleSpeech/pull/2660).
- 🔥 2022.11.07: Add [U2/U2++ C++ High Performance Streaming ASR Deployment](./speechx/examples/u2pp_ol/wenetspeech).
- 🔥 2022.11.07: Add [U2/U2++ C++ High Performance Streaming ASR Deployment](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/runtime/examples/u2pp_ol/wenetspeech).
- 👑 2022.11.01: Add [Adversarial Loss](https://arxiv.org/pdf/1907.04448.pdf) for [Chinese English mixed TTS](./examples/zh_en_tts/tts3).
- 🔥 2022.10.26: Add [Prosody Prediction](./examples/other/rhy) for TTS.
- 🎉 2022.10.21: Add [SSML](https://github.com/PaddlePaddle/PaddleSpeech/discussions/2538) for TTS Chinese Text Frontend.

@ -184,6 +184,7 @@
### Recent Update
- 👑 2023.04.25: Add [AMP training for U2 conformer](https://github.com/PaddlePaddle/PaddleSpeech/pull/3167).
- 👑 2023.04.06: Add [subtitle file (.srt format) generation](./demos/streaming_asr_server).
- 🔥 2023.03.14: Add SVS (Singing Voice Synthesis) examples based on the Opencpop dataset, including [DiffSinger](./examples/opencpop/svs1), [PWGAN](./examples/opencpop/voc1) and [HiFiGAN](./examples/opencpop/voc5); synthesis quality is still being improved.
- 👑 2023.03.09: Add [Wav2vec2ASR-zh](./examples/aishell/asr3).
- 🎉 2023.03.07: Add [TTS ARM Linux C++ deployment demo (with C++ Chinese text frontend)](./demos/TTSArmLinux).

@ -191,7 +191,7 @@ def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
if sr <= 0:
raise ParameterError(
f'Sample rate should be larger than 0, recieved sr = {sr}')
f'Sample rate should be larger than 0, received sr = {sr}')
if y.dtype not in ['int16', 'int8']:
warnings.warn(

@ -18,139 +18,7 @@ Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = 'http://www.openslr.org/resources/62'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/aidatatang_200zh",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = []
transcript_path = os.path.join(data_dir, 'transcript',
'aidatatang_200_zh_transcript.txt')
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove whitespace to get character-level text
text = ''.join(text.split())
transcript_dict[audio_id] = text
data_types = ['train', 'dev', 'test']
for dtype in data_types:
del json_lines[:]
total_sec = 0.0
total_text = 0.0
total_num = 0
audio_dir = os.path.join(data_dir, 'corpus/', dtype)
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
if not fname.endswith('.wav'):
continue
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
json_lines.append(
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text,
},
ensure_ascii=False))
total_sec += duration
total_text += len(text)
total_num += 1
manifest_path = manifest_path_prefix + '.' + dtype
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
manifest_dir = os.path.dirname(manifest_path_prefix)
meta_path = os.path.join(manifest_dir, dtype) + '.meta'
with open(meta_path, 'w') as f:
print(f"{dtype}:", file=f)
print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f)
print(f"{total_text} text", file=f)
print(f"{total_text / total_sec} text/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f)
def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
"""Download, unpack and create manifest file."""
data_dir = os.path.join(target_dir, subset)
if not os.path.exists(data_dir):
filepath = download(url, md5sum, target_dir)
unpack(filepath, target_dir)
# unpack all audio tar files
audio_dir = os.path.join(data_dir, 'corpus')
for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)):
for sub in dirlist:
print(f"unpack dir {sub}...")
for folder, _, filelist in sorted(
os.walk(os.path.join(subfolder, sub))):
for ftar in filelist:
unpack(os.path.join(folder, ftar), folder, True)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
create_manifest(data_dir, manifest_path)
def main():
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
prepare_dataset(
url=DATA_URL,
md5sum=MD5_DATA,
target_dir=args.target_dir,
manifest_path=args.manifest_prefix,
subset='aidatatang_200zh')
print("Data download and manifest prepare done!")
from paddlespeech.dataset.aidatatang_200zh import aidatatang_200zh_main
if __name__ == '__main__':
main()
aidatatang_200zh_main()

@ -1,3 +0,0 @@
# [Aishell1](http://openslr.elda.org/33/)
This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is a part of AISHELL-ASR0009, whose utterances cover 11 domains, including smart home, autonomous driving, and industrial production. All recordings were made in a quiet indoor environment, using 3 different devices at the same time: a high-fidelity microphone (44.1 kHz, 16-bit), an Android mobile phone (16 kHz, 16-bit), and an iOS mobile phone (16 kHz, 16-bit). The high-fidelity audio was re-sampled to 16 kHz to build AISHELL-ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. The manual transcription accuracy is above 95%, ensured by professional speech annotation and strict quality inspection. The corpus is divided into training, development and testing sets. (This database is free for academic research; commercial use requires permission.)

@ -18,143 +18,7 @@ Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = 'http://openslr.elda.org/resources/33'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
DATA_URL = URL_ROOT + '/data_aishell.tgz'
MD5_DATA = '2f494334227864a8a8fec932999db9d8'
RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/Aishell",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = []
transcript_path = os.path.join(data_dir, 'transcript',
'aishell_transcript_v0.8.txt')
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove whitespace to get character-level text
text = ''.join(text.split())
transcript_dict[audio_id] = text
data_types = ['train', 'dev', 'test']
for dtype in data_types:
del json_lines[:]
total_sec = 0.0
total_text = 0.0
total_num = 0
audio_dir = os.path.join(data_dir, 'wav', dtype)
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
# skip audio files that have no transcription
if audio_id not in transcript_dict:
continue
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
json_lines.append(
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text
},
ensure_ascii=False))
total_sec += duration
total_text += len(text)
total_num += 1
manifest_path = manifest_path_prefix + '.' + dtype
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
manifest_dir = os.path.dirname(manifest_path_prefix)
meta_path = os.path.join(manifest_dir, dtype) + '.meta'
with open(meta_path, 'w') as f:
print(f"{dtype}:", file=f)
print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f)
print(f"{total_text} text", file=f)
print(f"{total_text / total_sec} text/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f)
def prepare_dataset(url, md5sum, target_dir, manifest_path=None):
"""Download, unpack and create manifest file."""
data_dir = os.path.join(target_dir, 'data_aishell')
if not os.path.exists(data_dir):
filepath = download(url, md5sum, target_dir)
unpack(filepath, target_dir)
# unpack all audio tar files
audio_dir = os.path.join(data_dir, 'wav')
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for ftar in filelist:
unpack(os.path.join(subfolder, ftar), subfolder, True)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
if manifest_path:
create_manifest(data_dir, manifest_path)
def main():
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
prepare_dataset(
url=DATA_URL,
md5sum=MD5_DATA,
target_dir=args.target_dir,
manifest_path=args.manifest_prefix)
prepare_dataset(
url=RESOURCE_URL,
md5sum=MD5_RESOURCE,
target_dir=args.target_dir,
manifest_path=None)
print("Data download and manifest prepare done!")
from paddlespeech.dataset.aishell import aishell_main
if __name__ == '__main__':
main()
aishell_main()

@ -28,8 +28,8 @@ from multiprocessing.pool import Pool
import distutils.util
import soundfile
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
URL_ROOT = "http://openslr.elda.org/resources/12"
#URL_ROOT = "https://openslr.magicdatatech.com/resources/12"

@ -27,8 +27,8 @@ from multiprocessing.pool import Pool
import soundfile
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
URL_ROOT = "http://openslr.elda.org/resources/31"
URL_TRAIN_CLEAN = URL_ROOT + "/train-clean-5.tar.gz"

@ -29,8 +29,8 @@ import os
import soundfile
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

@ -29,8 +29,8 @@ import os
import soundfile
from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

@ -27,8 +27,8 @@ from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

@ -28,7 +28,7 @@ from pathlib import Path
import soundfile
from utils.utility import unzip
from paddlespeech.dataset.download import unzip
URL_ROOT = ""
MD5_DATA = "45c68037c7fdfe063a43c851f181fb2d"

@ -31,9 +31,9 @@ from pathlib import Path
import soundfile
from utils.utility import check_md5sum
from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import check_md5sum
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip
# all the data will be downloaded into the current data/voxceleb directory by default
DATA_HOME = os.path.expanduser('.')

@ -27,9 +27,9 @@ from pathlib import Path
import soundfile
from utils.utility import check_md5sum
from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import check_md5sum
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip
# all the data will be downloaded into the current data/voxceleb directory by default
DATA_HOME = os.path.expanduser('.')

@ -28,9 +28,9 @@ import subprocess
import soundfile
from utils.utility import download_multi
from utils.utility import getfile_insensitive
from utils.utility import unpack
from paddlespeech.dataset.download import download_multi
from paddlespeech.dataset.download import getfile_insensitive
from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')

@ -1,6 +1,6 @@
# Speech Synthesis Java API Demo Usage Guide
This demo implements speech synthesis on Android. It is easy to use and extensible, e.g. you can run your own trained model in the demo.
This document mainly describes how to run the speech synthesis demo.

@ -6,13 +6,13 @@
--jieba_stop_word_path=./dict/jieba/stop_words.utf8
# dict conf fastspeech2_0.4
--seperate_tone=false
--separate_tone=false
--word2phone_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
--phone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--tone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
# dict conf speedyspeech_0.5
#--seperate_tone=true
#--separate_tone=true
#--word2phone_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
#--phone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#--tone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt

@ -6,13 +6,13 @@
--jieba_stop_word_path=./front_demo/dict/jieba/stop_words.utf8
# dict conf fastspeech2_0.4
--seperate_tone=false
--separate_tone=false
--word2phone_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
--phone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--tone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
# dict conf speedyspeech_0.5
#--seperate_tone=true
#--separate_tone=true
#--word2phone_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
#--phone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#--tone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt

@ -20,7 +20,7 @@
DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized");
DEFINE_string(front_conf, "./front_demo/front.conf", "Front conf file");
// DEFINE_string(seperate_tone, "true", "If true, get phoneids and tonesid");
// DEFINE_string(separate_tone, "true", "If true, get phoneids and tonesid");
int main(int argc, char** argv) {

@ -20,7 +20,7 @@ worddict = "./dict/jieba_part.dict.utf8"
newdict = "./dict/word_phones.dict"
def GenPhones(initials, finals, seperate=True):
def GenPhones(initials, finals, separate=True):
phones = []
for c, v in zip(initials, finals):
@ -30,9 +30,9 @@ def GenPhones(initials, finals, seperate=True):
elif c in ['zh', 'ch', 'sh', 'r']:
v = re.sub('i', 'iii', v)
if c:
if seperate is True:
if separate is True:
phones.append(c + '0')
elif seperate is False:
elif separate is False:
phones.append(c)
else:
print("Not sure whether phone and tone need to be separated")

@ -126,7 +126,7 @@ int FrontEngineInterface::init() {
}
// Generate the tone dictionary (a mapping from tone to tone id)
if (_seperate_tone == "true") {
if (_separate_tone == "true") {
if (0 != GenDict(_tone2id_path, &tone_id_map)) {
LOG(ERROR) << "Generate tone2id dict failed";
return -1;
@ -168,7 +168,7 @@ int FrontEngineInterface::ReadConfFile() {
_jieba_stop_word_path = conf_map["jieba_stop_word_path"];
// dict path
_seperate_tone = conf_map["seperate_tone"];
_separate_tone = conf_map["separate_tone"];
_word2phone_path = conf_map["word2phone_path"];
_phone2id_path = conf_map["phone2id_path"];
_tone2id_path = conf_map["tone2id_path"];
@ -295,7 +295,7 @@ int FrontEngineInterface::GetWordsIds(
}
}
} else {  // punctuation
if (_seperate_tone == "true") {
if (_separate_tone == "true") {
phone = "sp0"; // speedyspeech
} else {
phone = "sp"; // fastspeech2
@ -354,7 +354,7 @@ int FrontEngineInterface::Phone2Phoneid(const std::string &phone,
std::string temp_phone;
for (int i = 0; i < phone_vec.size(); i++) {
temp_phone = phone_vec[i];
if (_seperate_tone == "true") {
if (_separate_tone == "true") {
phoneid->push_back(atoi(
(phone_id_map[temp_phone.substr(0, temp_phone.length() - 1)])
.c_str()));

@ -182,7 +182,7 @@ class FrontEngineInterface : public TextNormalizer {
std::string _jieba_idf_path;
std::string _jieba_stop_word_path;
std::string _seperate_tone;
std::string _separate_tone;
std::string _word2phone_path;
std::string _phone2id_path;
std::string _tone2id_path;

@ -14,8 +14,8 @@
from audio_search import app
from fastapi.testclient import TestClient
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
client = TestClient(app)

@ -14,8 +14,8 @@
from fastapi.testclient import TestClient
from vpr_search import app
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
client = TestClient(app)

@ -23,7 +23,7 @@ Paddle Speech Demo is a demo application built around the speech interaction features of PaddleSpeech
+ ERNIE-SAT: a visual demo of the ERNIE-SAT cross-modal (language and speech) large model, supporting personalized synthesis, cross-lingual speech synthesis (when the audio is Chinese, English text is fed in for synthesis), and speech editing (modifying words in the middle of an utterance). For more implementation details of ERNIE-SAT, see:
+ [【ERNIE-SAT with AISHELL-3 dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/ernie_sat)
+ [【ERNIE-SAT with with AISHELL3 and VCTK datasets】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat)
+ [【ERNIE-SAT with AISHELL3 and VCTK datasets】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat)
+ [【ERNIE-SAT with VCTK dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/ernie_sat)
Demo result:

@ -260,7 +260,7 @@ async def websocket_endpoint_online(websocket: WebSocket):
# and we break the loop
if message['signal'] == 'start':
resp = {"status": "ok", "signal": "server_ready"}
# do something at begining here
# do something at beginning here
# create the instance to process the audio
# connection_handler = chatbot.asr.connection_handler
connection_handler = PaddleASRConnectionHanddler(engine)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -0,0 +1,162 @@
#!/usr/bin/python
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# calc avg RTF(NOT Accurate): grep -rn RTF log.txt | awk '{print $NF}' | awk -F "=" '{sum += $NF} END {print "all time",sum, "audio num", NR, "RTF", sum/NR}'
# python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
# python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --wavfile ./zh.wav
import argparse
import asyncio
import codecs
import os
from pydub import AudioSegment
import re
from paddlespeech.cli.log import logger
from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler
def convert_to_wav(input_file):
# Load audio file
audio = AudioSegment.from_file(input_file)
# Set parameters for audio file
audio = audio.set_channels(1)
audio = audio.set_frame_rate(16000)
# Create output filename
output_file = os.path.splitext(input_file)[0] + ".wav"
# Export audio file as WAV
audio.export(output_file, format="wav")
logger.info(f"{input_file} converted to {output_file}")
def format_time(sec):
# Convert seconds to SRT format (HH:MM:SS,ms)
hours = int(sec/3600)
minutes = int((sec%3600)/60)
seconds = int(sec%60)
milliseconds = int((sec%1)*1000)
return f'{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}'
def results2srt(results, srt_file):
"""convert results from paddlespeech to srt format for subtitle
Args:
results (dict): results from paddlespeech
"""
# times contains start and end time of each word
times = results['times']
# result contains the whole sentence including punctuation
result = results['result']
# split result into sentences by '，' and '。'
sentences = re.split('，|。', result)[:-1]
# print("sentences: ", sentences)
# generate relative time for each sentence in sentences
relative_times = []
word_i = 0
for sentence in sentences:
relative_times.append([])
for word in sentence:
if relative_times[-1] == []:
relative_times[-1].append(times[word_i]['bg'])
if len(relative_times[-1]) == 1:
relative_times[-1].append(times[word_i]['ed'])
else:
relative_times[-1][1] = times[word_i]['ed']
word_i += 1
# print("relative_times: ", relative_times)
# generate srt file according to relative_times and sentences
with open(srt_file, 'w') as f:
for i in range(len(sentences)):
# Write index number
f.write(str(i+1)+'\n')
# Write start and end times
start = format_time(relative_times[i][0])
end = format_time(relative_times[i][1])
f.write(start + ' --> ' + end + '\n')
# Write text
f.write(sentences[i]+'\n\n')
logger.info(f"results saved to {srt_file}")
def main(args):
logger.info("asr websocket client start")
handler = ASRWsAudioHandler(
args.server_ip,
args.port,
endpoint=args.endpoint,
punc_server_ip=args.punc_server_ip,
punc_server_port=args.punc_server_port)
loop = asyncio.get_event_loop()
# check if the wav file is mp3 format
# if so, convert it to wav format using convert_to_wav function
if args.wavfile and os.path.exists(args.wavfile):
if args.wavfile.endswith(".mp3"):
convert_to_wav(args.wavfile)
args.wavfile = args.wavfile.replace(".mp3", ".wav")
# support to process single audio file
if args.wavfile and os.path.exists(args.wavfile):
logger.info(f"start to process the wav file: {args.wavfile}")
result = loop.run_until_complete(handler.run(args.wavfile))
# result = result["result"]
# logger.info(f"asr websocket client finished : {result}")
results2srt(result, args.wavfile.replace(".wav", ".srt"))
# support to process batch audios from wav.scp
if args.wavscp and os.path.exists(args.wavscp):
logger.info(f"start to process the wavscp: {args.wavscp}")
with codecs.open(args.wavscp, 'r', encoding='utf-8') as f,\
codecs.open("result.txt", 'w', encoding='utf-8') as w:
for line in f:
utt_name, utt_path = line.strip().split()
result = loop.run_until_complete(handler.run(utt_path))
result = result["result"]
w.write(f"{utt_name} {result}\n")
if __name__ == "__main__":
logger.info("Start to do streaming asr client")
parser = argparse.ArgumentParser()
parser.add_argument(
'--server_ip', type=str, default='127.0.0.1', help='server ip')
parser.add_argument('--port', type=int, default=8090, help='server port')
parser.add_argument(
'--punc.server_ip',
type=str,
default=None,
dest="punc_server_ip",
help='Punctuation server ip')
parser.add_argument(
'--punc.port',
type=int,
default=8091,
dest="punc_server_port",
help='Punctuation server port')
parser.add_argument(
"--endpoint",
type=str,
default="/paddlespeech/asr/streaming",
help="ASR websocket endpoint")
parser.add_argument(
"--wavfile",
action="store",
help="wav file path ",
default="./16_audio.wav")
parser.add_argument(
"--wavscp", type=str, default=None, help="The batch audios dict text")
args = parser.parse_args()
main(args)
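As a usage sketch (the server addresses and ports are placeholders and assume a streaming ASR server, and optionally a punctuation server, are already running), the client converts an `.mp3` input to `.wav`, runs recognition, and writes an `.srt` subtitle file next to the audio:

```bash
python3 websocket_client.py \
    --server_ip 127.0.0.1 --port 8290 \
    --punc.server_ip 127.0.0.1 --punc.port 8190 \
    --wavfile ./16_audio.mp3
# expected outputs: ./16_audio.wav (converted audio) and ./16_audio.srt (subtitles)
```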

@ -79,8 +79,8 @@ checkpoint_name
├── snapshot_iter_*.pdz
├── speech_stats.npy
├── phone_id_map.txt
├── spk_id_map.txt (optimal)
└── tone_id_map.txt (optimal)
├── spk_id_map.txt (optional)
└── tone_id_map.txt (optional)
```
**Vocoders:**
```text

@ -87,8 +87,8 @@ checkpoint_name
├── snapshot_iter_*.pdz
├── speech_stats.npy
├── phone_id_map.txt
├── spk_id_map.txt (optimal)
└── tone_id_map.txt (optimal)
├── spk_id_map.txt (optional)
└── tone_id_map.txt (optional)
```
**Vocoders:**
```text

@ -62,7 +62,7 @@
"collapsed": false
},
"source": [
"# Basic workflow of end-to-end speech translation with Transformer\n",
"## Base model\n",
"Since the ASR chapter has already introduced the Transformer and speech feature extraction, we will not repeat it here; interested readers can refer to that chapter.\n",
"\n",

@ -464,7 +464,7 @@
"<br><center> FastSpeech2 network architecture</center></br>\n",
"\n",
"\n",
"The FastSpeech2 implemented in PaddleSpeech TTS differs from the paper in that we use phone-level `pitch` and `energy` (similar to FastPitch), which makes the synthesized results more **stable**.\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/862c21456c784c41a83a308b7d9707f0810cc3b3c6f94ed48c60f5d32d0072f0\"></center>\n",
"<br><center> FastPitch network architecture</center></br>\n",
"\n",

@ -1,6 +1,6 @@
#!/bin/bash
if [ $# -lt 2 ] && [ $# -gt 3 ];then
if [ $# -lt 2 ] || [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi

@ -1,15 +1,21 @@
#!/bin/bash
if [ $# != 3 ];then
echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1
fi
set -e
stage=0
stop_stage=100
source utils/parse_options.sh || exit 1;
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
if [ $# != 3 ];then
echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1
fi
config_path=$1
decode_config_path=$2
ckpt_prefix=$3
@ -92,6 +98,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
fi
if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
echo "using sclite to compute cer..."
# format the reference test file for sclite
python utils/format_rsl.py \
--origin_ref data/manifest.test.raw \

@ -17,7 +17,7 @@ if [ ${seed} != 0 ]; then
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
fi
if [ $# -lt 2 ] && [ $# -gt 3 ];then
if [ $# -lt 2 ] || [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi

@ -1,6 +1,6 @@
#!/bin/bash
if [ $# -lt 2 ] && [ $# -gt 3 ];then
if [ $# -lt 2 ] || [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi

@ -0,0 +1,108 @@
# JETS with CSMSC
This example contains code used to train a [JETS](https://arxiv.org/abs/2203.16852v1) model with the [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
## Dataset
### Download and Extract
Download CSMSC from its [Official Website](https://test.data-baker.com/data/index/source).
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes and durations for JETS.
You can download it from [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
Run the command below to
1. **source path**.
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
- synthesize waveform from a text file.
```bash
./run.sh
```
You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage. For example, running the following command will only preprocess the dataset.
```bash
./run.sh --stage 0 --stop-stage 0
```
### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
```text
dump
├── dev
│   ├── norm
│   └── raw
├── phone_id_map.txt
├── speaker_id_map.txt
├── test
│   ├── norm
│   └── raw
└── train
├── feats_stats.npy
├── norm
└── raw
```
The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains wave, mel spectrogram, speech, pitch and energy features of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/feats_stats.npy`.
Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, the path of feats, feats_lengths, the path of pitch features, the path of energy features, the path of raw waves, speaker, and the id of each utterance.
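For a quick sanity check of the preprocessed data, one way (a sketch, assuming the metadata files are standard JSON lines) is to pretty-print a single record:

```bash
# peek at one normalized training record; field names should match the list above
head -n 1 dump/train/norm/metadata.jsonl | python3 -m json.tool
```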
### Model Training
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
`./local/train.sh` calls `${BIN_DIR}/train.py`.
Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--ngpu NGPU] [--phones-dict PHONES_DICT]
Train a JETS model.
optional arguments:
-h, --help show this help message and exit
--config CONFIG config file to overwrite default config.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
dev data.
--output-dir OUTPUT_DIR
output dir.
--ngpu NGPU if ngpu == 0, use cpu.
--phones-dict PHONES_DICT
phone vocabulary file.
```
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`.
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
5. `--phones-dict` is the path of the phone vocabulary file.
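Put together, `./local/train.sh` essentially expands to a call like the following (a sketch using the default config and the `dump` layout produced by the preprocessing step):

```bash
python3 ${BIN_DIR}/train.py \
    --config=conf/default.yaml \
    --train-metadata=dump/train/norm/metadata.jsonl \
    --dev-metadata=dump/dev/norm/metadata.jsonl \
    --output-dir=exp/default \
    --ngpu=1 \
    --phones-dict=dump/phone_id_map.txt
```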
### Synthesizing
`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveform from text file.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
## Pretrained Model
The pretrained model can be downloaded here:
- [jets_csmsc_ckpt_1.5.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/jets_csmsc_ckpt_1.5.0.zip)
The static model can be downloaded here:
- [jets_csmsc_static_1.5.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/jets_csmsc_static_1.5.0.zip)
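For example, the released checkpoints can be fetched and unpacked like this (a sketch; the URLs are the ones linked above):

```bash
wget https://paddlespeech.bj.bcebos.com/Parakeet/jets_csmsc_ckpt_1.5.0.zip
unzip jets_csmsc_ckpt_1.5.0.zip
```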

@ -0,0 +1,224 @@
# This configuration was tested on 4 GPUs (V100) with 32GB GPU
# memory. It takes around 2 weeks to finish the training,
# but a 100k-iteration model should already generate reasonable results.
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
n_mels: 80
fs: 22050 # sr
n_fft: 1024 # FFT size (samples).
n_shift: 256 # Hop size (samples). 12.5ms
win_length: null # Window length (samples). 50ms
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.
fmin: 0 # minimum frequency for Mel basis
fmax: null # maximum frequency for Mel basis
f0min: 80 # Minimum f0 for pitch extraction.
f0max: 400 # Maximum f0 for pitch extraction.
##########################################################
# TTS MODEL SETTING #
##########################################################
model:
# generator related
generator_type: jets_generator
generator_params:
adim: 256 # attention dimension
aheads: 2 # number of attention heads
elayers: 4 # number of encoder layers
eunits: 1024 # number of encoder ff units
dlayers: 4 # number of decoder layers
dunits: 1024 # number of decoder ff units
positionwise_layer_type: conv1d # type of position-wise layer
positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
duration_predictor_layers: 2 # number of layers of duration predictor
duration_predictor_chans: 256 # number of channels of duration predictor
duration_predictor_kernel_size: 3 # filter size of duration predictor
use_masking: True # whether to apply masking for padded part in loss calculation
encoder_normalize_before: True # whether to perform layer normalization before the input
decoder_normalize_before: True # whether to perform layer normalization before the input
encoder_type: transformer # encoder type
decoder_type: transformer # decoder type
conformer_rel_pos_type: latest # relative positional encoding type
conformer_pos_enc_layer_type: rel_pos # conformer positional encoding type
conformer_self_attn_layer_type: rel_selfattn # conformer self-attention type
conformer_activation_type: swish # conformer activation type
use_macaron_style_in_conformer: true # whether to use macaron style in conformer
use_cnn_in_conformer: true # whether to use CNN in conformer
conformer_enc_kernel_size: 7 # kernel size in CNN module of conformer-based encoder
conformer_dec_kernel_size: 31 # kernel size in CNN module of conformer-based decoder
init_type: xavier_uniform # initialization type
init_enc_alpha: 1.0 # initial value of alpha for encoder
init_dec_alpha: 1.0 # initial value of alpha for decoder
transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
transformer_dec_dropout_rate: 0.2 # dropout rate for transformer decoder layer
transformer_dec_positional_dropout_rate: 0.2 # dropout rate for transformer decoder positional encoding
transformer_dec_attn_dropout_rate: 0.2 # dropout rate for transformer decoder attention layer
pitch_predictor_layers: 5 # number of conv layers in pitch predictor
pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
stop_gradient_from_pitch_predictor: true # whether to stop the gradient from pitch predictor to encoder
energy_predictor_layers: 2 # number of conv layers in energy predictor
energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
energy_predictor_dropout: 0.5 # dropout rate in energy predictor
energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
stop_gradient_from_energy_predictor: false # whether to stop the gradient from energy predictor to encoder
generator_out_channels: 1
generator_channels: 512
generator_global_channels: -1
generator_kernel_size: 7
generator_upsample_scales: [8, 8, 2, 2]
generator_upsample_kernel_sizes: [16, 16, 4, 4]
generator_resblock_kernel_sizes: [3, 7, 11]
generator_resblock_dilations: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
generator_use_additional_convs: true
generator_bias: true
generator_nonlinear_activation: "leakyrelu"
generator_nonlinear_activation_params:
negative_slope: 0.1
generator_use_weight_norm: true
segment_size: 64 # segment size for random windowed discriminator
# discriminator related
discriminator_type: hifigan_multi_scale_multi_period_discriminator
discriminator_params:
scales: 1
scale_downsample_pooling: "AvgPool1D"
scale_downsample_pooling_params:
kernel_size: 4
stride: 2
padding: 2
scale_discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [15, 41, 5, 3]
channels: 128
max_downsample_channels: 1024
max_groups: 16
bias: True
downsample_scales: [2, 2, 4, 4, 1]
nonlinear_activation: "leakyrelu"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: True
use_spectral_norm: False
follow_official_norm: False
periods: [2, 3, 5, 7, 11]
period_discriminator_params:
in_channels: 1
out_channels: 1
kernel_sizes: [5, 3]
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
max_downsample_channels: 1024
bias: True
nonlinear_activation: "leakyrelu"
nonlinear_activation_params:
negative_slope: 0.1
use_weight_norm: True
use_spectral_norm: False
# others
sampling_rate: 22050 # needed in the inference for saving wav
cache_generator_outputs: True # whether to cache generator outputs in the training
use_alignment_module: False # whether to use alignment module
###########################################################
# LOSS SETTING #
###########################################################
# loss function related
generator_adv_loss_params:
average_by_discriminators: False # whether to average loss value by #discriminators
loss_type: mse # loss type, "mse" or "hinge"
discriminator_adv_loss_params:
average_by_discriminators: False # whether to average loss value by #discriminators
loss_type: mse # loss type, "mse" or "hinge"
feat_match_loss_params:
average_by_discriminators: False # whether to average loss value by #discriminators
average_by_layers: False # whether to average loss value by #layers of each discriminator
include_final_outputs: True # whether to include final outputs for loss calculation
mel_loss_params:
fs: 22050 # must be the same as the training data
fft_size: 1024 # fft points
hop_size: 256 # hop size
win_length: null # window length
window: hann # window type
num_mels: 80 # number of Mel basis
fmin: 0 # minimum frequency for Mel basis
fmax: null # maximum frequency for Mel basis
log_base: null # null represent natural log
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
lambda_adv: 1.0 # loss scaling coefficient for adversarial loss
lambda_mel: 45.0 # loss scaling coefficient for Mel loss
lambda_feat_match: 2.0 # loss scaling coefficient for feat match loss
lambda_var: 1.0 # loss scaling coefficient for duration loss
lambda_align: 2.0 # loss scaling coefficient for KL divergence loss
# others
sampling_rate: 22050 # needed in the inference for saving wav
cache_generator_outputs: True # whether to cache generator outputs in the training
# extra module for additional inputs
pitch_extract: dio # pitch extractor type
pitch_extract_conf:
reduction_factor: 1
use_token_averaged_f0: false
pitch_normalize: global_mvn # normalizer for the pitch feature
energy_extract: energy # energy extractor type
energy_extract_conf:
reduction_factor: 1
use_token_averaged_energy: false
energy_normalize: global_mvn # normalizer for the energy feature
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 32 # Batch size.
num_workers: 4 # Number of workers in DataLoader.
##########################################################
# OPTIMIZER & SCHEDULER SETTING #
##########################################################
# optimizer setting for generator
generator_optimizer_params:
beta1: 0.8
beta2: 0.99
epsilon: 1.0e-9
weight_decay: 0.0
generator_scheduler: exponential_decay
generator_scheduler_params:
learning_rate: 2.0e-4
gamma: 0.999875
# optimizer setting for discriminator
discriminator_optimizer_params:
beta1: 0.8
beta2: 0.99
epsilon: 1.0e-9
weight_decay: 0.0
discriminator_scheduler: exponential_decay
discriminator_scheduler_params:
learning_rate: 2.0e-4
gamma: 0.999875
generator_first: True # whether to start updating generator first
##########################################################
# OTHER TRAINING SETTING #
##########################################################
num_snapshots: 10 # max number of snapshots to keep while training
train_max_steps: 350000 # Number of training steps. == total_iters / ngpus, total_iters = 1000000
save_interval_steps: 1000 # Interval steps to save checkpoint.
eval_interval_steps: 250 # Interval steps to evaluate the network.
seed: 777 # random seed number

@ -0,0 +1,15 @@
#!/bin/bash
train_output_path=$1
stage=0
stop_stage=0
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/inference.py \
--inference_dir=${train_output_path}/inference \
--am=jets_csmsc \
--text=${BIN_DIR}/../sentences.txt \
--output_dir=${train_output_path}/pd_infer_out \
--phones_dict=dump/phone_id_map.txt
fi

@ -0,0 +1,77 @@
#!/bin/bash
set -e
stage=0
stop_stage=100
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# get durations from MFA's result
echo "Generate durations.txt from MFA results ..."
python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \
--inputdir=./baker_alignment_tone \
--output=durations.txt \
--config=${config_path}
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# extract features
echo "Extract features ..."
python3 ${BIN_DIR}/preprocess.py \
--dataset=baker \
--rootdir=~/datasets/BZNSYP/ \
--dumpdir=dump \
--dur-file=durations.txt \
--config=${config_path} \
--num-cpu=20 \
--cut-sil=True \
--token_average=True
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# get features' stats(mean and std)
echo "Get features' stats ..."
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
--metadata=dump/train/raw/metadata.jsonl \
--field-name="feats"
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
--metadata=dump/train/raw/metadata.jsonl \
--field-name="pitch"
python3 ${MAIN_ROOT}/utils/compute_statistics.py \
--metadata=dump/train/raw/metadata.jsonl \
--field-name="energy"
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# normalize and convert phone/speaker to id, dev and test should use train's stats
echo "Normalize ..."
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--feats-stats=dump/train/feats_stats.npy \
--pitch-stats=dump/train/pitch_stats.npy \
--energy-stats=dump/train/energy_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--feats-stats=dump/train/feats_stats.npy \
--pitch-stats=dump/train/pitch_stats.npy \
--energy-stats=dump/train/energy_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--feats-stats=dump/train/feats_stats.npy \
--pitch-stats=dump/train/pitch_stats.npy \
--energy-stats=dump/train/energy_stats.npy \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt
fi

@ -0,0 +1,18 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/synthesize.py \
--config=${config_path} \
--ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--phones_dict=dump/phone_id_map.txt \
--test_metadata=dump/test/norm/metadata.jsonl \
--output_dir=${train_output_path}/test
fi

@ -0,0 +1,22 @@
#!/bin/bash
config_path=$1
train_output_path=$2
ckpt_name=$3
stage=0
stop_stage=0
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
FLAGS_allocator_strategy=naive_best_fit \
FLAGS_fraction_of_gpu_memory_to_use=0.01 \
python3 ${BIN_DIR}/synthesize_e2e.py \
--am=jets_csmsc \
--config=${config_path} \
--ckpt=${train_output_path}/checkpoints/${ckpt_name} \
--phones_dict=dump/phone_id_map.txt \
--output_dir=${train_output_path}/test_e2e \
--text=${BIN_DIR}/../sentences.txt \
--inference_dir=${train_output_path}/inference
fi

@ -0,0 +1,12 @@
#!/bin/bash
config_path=$1
train_output_path=$2
python3 ${BIN_DIR}/train.py \
--train-metadata=dump/train/norm/metadata.jsonl \
--dev-metadata=dump/dev/norm/metadata.jsonl \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=1 \
--phones-dict=dump/phone_id_map.txt

@ -0,0 +1,13 @@
#!/bin/bash
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
export PYTHONDONTWRITEBYTECODE=1
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
MODEL=jets
export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL}

@ -0,0 +1,41 @@
#!/bin/bash
set -e
source path.sh
gpus=0
stage=0
stop_stage=100
conf_path=conf/default.yaml
train_output_path=exp/default
ckpt_name=snapshot_iter_150000.pdz
# with the following command, you can choose the stage range you want to run
# such as `./run.sh --stage 0 --stop-stage 0`
# this cannot be mixed with positional arguments `$1`, `$2` ...
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
./local/preprocess.sh ${conf_path}|| exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `train_output_path/checkpoints/` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# synthesize_e2e
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
fi

@ -7,7 +7,7 @@ stage=0
stop_stage=0
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
python3 ${BIN_DIR}/../lite_predict.py \
python3 ${BIN_DIR}/lite_predict.py \
--inference_dir=${train_output_path}/pdlite \
--am=vits_csmsc \
--text=${BIN_DIR}/../sentences.txt \

@ -54,16 +54,16 @@ fi
# ./local/ort_predict.sh ${train_output_path}
# fi
# # not ready yet for operator missing in Paddle-Lite
# # must run after stage 3 (which stage generated static models)
# if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# # NOTE by yuantian 2022.11.21: please compile develop version of Paddle-Lite to export and run TTS models,
# # cause TTS models are supported by https://github.com/PaddlePaddle/Paddle-Lite/pull/9587
# # and https://github.com/PaddlePaddle/Paddle-Lite/pull/9706
# ./local/export2lite.sh ${train_output_path} inference pdlite vits_csmsc x86
# fi
# not ready yet for operator missing in Paddle-Lite
# must run after stage 3 (which stage generated static models)
if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
# NOTE by yuantian 2022.11.21: please compile develop version of Paddle-Lite to export and run TTS models,
# cause TTS models are supported by https://github.com/PaddlePaddle/Paddle-Lite/pull/10128
# vits can only run in arm
./local/export2lite.sh ${train_output_path} inference pdlite vits_csmsc arm
fi
# if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
# CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
# fi
if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/lite_predict.sh ${train_output_path} || exit -1
fi

@ -0,0 +1,174 @@
# This is the configuration file for CSMSC dataset.
# This configuration is based on HiFiGAN V1, which is an official configuration.
# But I found that the optimizer setting does not work well with my implementation.
# So I changed optimizer settings as follows:
# - AdamW -> Adam
# - betas: [0.8, 0.99] -> betas: [0.5, 0.9]
# - Scheduler: ExponentialLR -> MultiStepLR
# To match the shift size difference, the upsample scales are also modified from the original 256 shift setting.
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
fs: 24000 # Sampling rate.
n_fft: 2048 # FFT size (samples).
n_shift: 300 # Hop size (samples). 12.5ms
win_length: 1200 # Window length (samples). 50ms
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.
n_mels: 80 # Number of mel basis.
fmin: 80 # Minimum freq in mel basis calculation. (Hz)
fmax: 7600 # Maximum frequency in mel basis calculation. (Hz)
###########################################################
# GENERATOR NETWORK ARCHITECTURE SETTING #
###########################################################
generator_params:
use_istft: True # Use iSTFTNet.
istft_layer_id: 2 # Use istft after istft_layer_id layers of upsample layer if use_istft=True.
n_fft: 2048 # FFT size (samples) in feature extraction.
win_length: 1200 # Window length (samples) in feature extraction.
in_channels: 80 # Number of input channels.
out_channels: 1 # Number of output channels.
channels: 512 # Number of initial channels.
kernel_size: 7 # Kernel size of initial and final conv layers.
upsample_scales: [5, 5, 4, 3] # Upsampling scales.
upsample_kernel_sizes: [10, 10, 8, 6] # Kernel size for upsampling layers.
resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
resblock_dilations: # Dilations for residual blocks.
- [1, 3, 5]
- [1, 3, 5]
- [1, 3, 5]
use_additional_convs: True # Whether to use additional conv layer in residual blocks.
bias: True # Whether to use bias parameter in conv.
nonlinear_activation: "leakyrelu" # Nonlinear activation type.
nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
###########################################################
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
###########################################################
discriminator_params:
scales: 3 # Number of multi-scale discriminator.
scale_downsample_pooling: "AvgPool1D" # Pooling operation for scale discriminator.
scale_downsample_pooling_params:
kernel_size: 4 # Pooling kernel size.
stride: 2 # Pooling stride.
padding: 2 # Padding size.
scale_discriminator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
channels: 128 # Initial number of channels.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
max_groups: 16 # Maximum number of groups in downsampling conv layers.
bias: True
downsample_scales: [4, 4, 4, 4, 1] # Downsampling scales.
nonlinear_activation: "leakyrelu" # Nonlinear activation.
nonlinear_activation_params:
negative_slope: 0.1
follow_official_norm: True # Whether to follow the official norm setting.
periods: [2, 3, 5, 7, 11] # List of period for multi-period discriminator.
period_discriminator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_sizes: [5, 3] # List of kernel sizes.
channels: 32 # Initial number of channels.
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
bias: True # Whether to use bias parameter in conv layer.
nonlinear_activation: "leakyrelu" # Nonlinear activation.
nonlinear_activation_params: # Nonlinear activation parameters.
negative_slope: 0.1
use_weight_norm: True # Whether to apply weight normalization.
use_spectral_norm: False # Whether to apply spectral normalization.
###########################################################
# STFT LOSS SETTING #
###########################################################
use_stft_loss: False # Whether to use multi-resolution STFT loss.
use_mel_loss: True # Whether to use Mel-spectrogram loss.
mel_loss_params:
fs: 24000
fft_size: 2048
hop_size: 300
win_length: 1200
window: "hann"
num_mels: 80
fmin: 0
fmax: 12000
log_base: null
generator_adv_loss_params:
average_by_discriminators: False # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
average_by_discriminators: False # Whether to average loss by #discriminators.
use_feat_match_loss: True
feat_match_loss_params:
average_by_discriminators: False # Whether to average loss by #discriminators.
average_by_layers: False # Whether to average loss by #layers in each discriminator.
include_final_outputs: False # Whether to include final outputs in feat match loss calculation.
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
lambda_aux: 45.0 # Loss balancing coefficient for STFT loss.
lambda_adv: 1.0 # Loss balancing coefficient for adversarial loss.
lambda_feat_match: 2.0 # Loss balancing coefficient for feat match loss.
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 16 # Batch size.
batch_max_steps: 8400 # Length of each audio in batch. Make sure it is divisible by hop_size.
num_workers: 2 # Number of workers in DataLoader.
###########################################################
# OPTIMIZER & SCHEDULER SETTING #
###########################################################
generator_optimizer_params:
beta1: 0.5
beta2: 0.9
weight_decay: 0.0 # Generator's weight decay coefficient.
generator_scheduler_params:
learning_rate: 2.0e-4 # Generator's learning rate.
gamma: 0.5 # Generator's scheduler gamma.
milestones: # At each milestone, lr will be multiplied by gamma.
- 200000
- 400000
- 600000
- 800000
generator_grad_norm: -1 # Generator's gradient norm.
discriminator_optimizer_params:
beta1: 0.5
beta2: 0.9
weight_decay: 0.0 # Discriminator's weight decay coefficient.
discriminator_scheduler_params:
learning_rate: 2.0e-4 # Discriminator's learning rate.
gamma: 0.5 # Discriminator's scheduler gamma.
milestones: # At each milestone, lr will be multiplied by gamma.
- 200000
- 400000
- 600000
- 800000
discriminator_grad_norm: -1 # Discriminator's gradient norm.
###########################################################
# INTERVAL SETTING #
###########################################################
generator_train_start_steps: 1 # Number of steps to start to train generator.
discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
train_max_steps: 2500000 # Number of training steps.
save_interval_steps: 5000 # Interval steps to save checkpoint.
eval_interval_steps: 1000 # Interval steps to evaluate the network.
###########################################################
# OTHER SETTING #
###########################################################
num_snapshots: 10 # max number of snapshots to keep while training
seed: 42 # random seed for paddle, random, and np.random
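As a quick sanity check on the data loader settings above, the sketch below loads the config and verifies that `batch_max_steps` is a multiple of the hop size (a minimal illustration, assuming PyYAML is installed, that the file is saved as `conf/iSTFT.yaml` as in the training section below, and that the hop size is read from `mel_loss_params`):
```python
# Minimal sketch: load the vocoder config and check the batch_max_steps constraint.
import yaml

with open("conf/iSTFT.yaml") as f:        # path assumed from the README below
    cfg = yaml.safe_load(f)

hop_size = cfg["mel_loss_params"]["hop_size"]   # 300 in this config
batch_max_steps = cfg["batch_max_steps"]        # 8400 in this config

assert batch_max_steps % hop_size == 0, "batch_max_steps must be divisible by hop_size"
print(f"{batch_max_steps} samples per clip -> {batch_max_steps // hop_size} frames")  # 28 frames
```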

@ -0,0 +1,145 @@
# iSTFTNet with CSMSC
This example contains code used to train an [iSTFTNet](https://arxiv.org/abs/2203.02395) model with the [Chinese Standard Mandarin Speech Corpus](https://www.data-baker.com/open_source.html).
## Dataset
### Download and Extract
Download CSMSC from its [official website](https://test.data-baker.com/data/index/TNtts/) and extract it to `~/datasets`. Then the dataset is in the directory `~/datasets/BZNSYP`.
### Get MFA Result and Extract
We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) results to cut silence at the edge of audio.
You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo.
## Get Started
Assume the path to the dataset is `~/datasets/BZNSYP`.
Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`.
Run the command below to
1. **source path**.
2. preprocess the dataset.
3. train the model.
4. synthesize wavs.
- synthesize waveform from `metadata.jsonl`.
```bash
./run.sh
```
You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset.
```bash
./run.sh --stage 0 --stop-stage 0
```
### Data Preprocessing
```bash
./local/preprocess.sh ${conf_path}
```
When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below.
```text
dump
├── dev
│ ├── norm
│ └── raw
├── test
│ ├── norm
│ └── raw
└── train
├── norm
├── raw
└── feats_stats.npy
```
The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The `raw` folder contains the log magnitude of the mel spectrogram of each utterance, while the `norm` folder contains the normalized spectrogram. The statistics used to normalize the spectrogram are computed from the training set and stored in `dump/train/feats_stats.npy`.
Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains utterance ids and paths to the spectrogram of each utterance.
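For a quick look at the processed data, here is a minimal sketch (assuming the `jsonlines` and `numpy` packages are installed and preprocessing has produced the `dump` folder described above; the exact record fields depend on the preprocessing script):
```python
# Minimal sketch: peek at one metadata record and the normalization statistics.
import jsonlines
import numpy as np

with jsonlines.open("dump/train/norm/metadata.jsonl") as reader:
    first = next(iter(reader))
print(sorted(first.keys()))     # field names of one utterance record

stats = np.load("dump/train/feats_stats.npy")
print(stats.shape)              # statistics used to normalize the spectrogram
```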
### Model Training
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path}
```
`./local/train.sh` calls `${BIN_DIR}/train.py`.
Here's the complete help message.
```text
usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA]
[--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR]
[--ngpu NGPU]
Train a HiFiGAN model.
optional arguments:
-h, --help show this help message and exit
--config CONFIG HiFiGAN config file.
--train-metadata TRAIN_METADATA
training data.
--dev-metadata DEV_METADATA
dev data.
--output-dir OUTPUT_DIR
output dir.
--ngpu NGPU if ngpu == 0, use cpu.
```
1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/iSTFT.yaml`.
2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder.
3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory.
4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
### Synthesizing
`./local/synthesize.sh` calls `${BIN_DIR}/../synthesize.py`, which can synthesize waveform from `metadata.jsonl`.
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
```text
usage: synthesize.py [-h] [--generator-type GENERATOR_TYPE] [--config CONFIG]
[--checkpoint CHECKPOINT] [--test-metadata TEST_METADATA]
[--output-dir OUTPUT_DIR] [--ngpu NGPU]
Synthesize with GANVocoder.
optional arguments:
-h, --help show this help message and exit
--generator-type GENERATOR_TYPE
type of GANVocoder, should in {pwgan, mb_melgan,
style_melgan, } now
--config CONFIG GANVocoder config file.
--checkpoint CHECKPOINT
snapshot to load.
--test-metadata TEST_METADATA
dev data.
--output-dir OUTPUT_DIR
output dir.
--ngpu NGPU if ngpu == 0, use cpu.
```
1. `--config` is the config file. You should use the same config with which the model was trained.
2. `--checkpoint` is the checkpoint to load. Pick one of the checkpoints from `checkpoints` inside the training output directory.
3. `--test-metadata` is the metadata of the test dataset. Use the `metadata.jsonl` in the `dev/norm` subfolder from the processed directory.
4. `--output-dir` is the directory to save the synthesized audio files.
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Models
The pretrained model can be downloaded here:
- [iSTFTNet_csmsc_ckpt.zip](https://pan.baidu.com/s/1SNDlRWOGOcbbrKf5w-TJaA?pwd=r1e5)
The iSTFTNet checkpoint contains the files listed below.
```text
iSTFTNet_csmsc_ckpt
├── iSTFT.yaml                  # config used to train iSTFTNet
├── feats_stats.npy             # statistics used to normalize spectrogram when training iSTFTNet
└── snapshot_iter_50000.pdz     # generator parameters of iSTFTNet
```
A comparison between iSTFTNet and HiFiGAN:
| Model | Step | eval/generator_loss | eval/mel_loss | eval/feature_matching_loss | rtf |
|:--------:|:--------------:|:-------------------:|:-------------:|:--------------------------:| :---: |
| hifigan | 1(gpu) x 50000 | 13.989 | 0.14683 | 1.3484 | 0.01767 |
| istftNet | 1(gpu) x 50000 | 13.319 | 0.14818 | 1.1069 | 0.01069 |
> RTF (real-time factor) is tested on the CSMSC test dataset; the test environment is AI Studio V100 16G with 1 GPU, and the test command is `./run.sh --stage 2 --stop-stage 2`
The pretrained HiFiGAN model used in the comparison can be downloaded here:
- [hifigan_csmsc_ckpt.zip](https://pan.baidu.com/s/1pGY6RYV7yEB_5hRI_JoWig?pwd=tcaj)
## Acknowledgement
We adapted some code from https://github.com/rishikksh20/iSTFTNet-pytorch.git.

@ -153,7 +153,7 @@ After training the model, we need to get the final model for testing and inferen
```bash
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh lastest exp/${ckpt}/checkpoints ${avg_num}
avg.sh latest exp/${ckpt}/checkpoints ${avg_num}
fi
```
`avg.sh` is in `../../../utils/`, which is defined in `path.sh`.

@ -48,7 +48,7 @@ def rule(C, V, R, T):
'i' is distinguished when it appears in phonemes, and is separated into 3 categories: 'i', 'ii' and 'iii'.
Erhua is is possibly applied to every finals, except for finals that already ends with 'r'.
Erhua is possibly applied to every final, except for finals that already end with 'r'.
When a syllable is impossible or does not have any characters with this pronunciation, return None
to filter it out.

@ -37,7 +37,7 @@ It will support the way of using `--variable value` in the shell scripts.
Some local variables are set in `run.sh`.
`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU.
`stage` denotes the number of the stage you want to start from in the experiments.
`stop stage` denotes the number of stage you want the stop at in the expriments.
`stop_stage` denotes the number of the stage you want to stop at in the experiments.
`conf_path` denotes the config path of the model.
`avg_num` denotes the number K of top-K models you want to average to get the final model.
`ckpt` denotes the checkpoint prefix of the model, e.g. "transformer".

@ -1,22 +1,135 @@
###########################################################
# FEATURE EXTRACTION SETTING #
###########################################################
# The original implementation loads audio at 24 kHz but extracts mel features at 16 kHz; both loading and mel extraction should later be unified to 24 kHz.
fs: 16000
n_fft: 2048
n_shift: 300
win_length: 1200 # Window length (in samples), 50 ms.
# If set to null, it will be the same as fft_size.
window: "hann" # Window function.
fmin: 0 # Minimum frequency of Mel basis.
fmax: 8000 # Maximum frequency of Mel basis. sr // 2
n_mels: 80
# only for StarGANv2 VC
norm: # None here
htk: True
power: 2.0
###########################################################
# MODEL SETTING #
###########################################################
generator_params:
dim_in: 64
style_dim: 64
max_conv_dim: 512
w_hpf: 0
F0_channel: 256
mapping_network_params:
num_domains: 20 # num of speakers in StarGANv2
latent_dim: 16
style_dim: 64 # same as style_dim in generator_params
hidden_dim: 512 # same as max_conv_dim in generator_params
style_encoder_params:
dim_in: 64 # same as dim_in in generator_params
style_dim: 64 # same as style_dim in generator_params
num_domains: 20 # same as num_domains in generator_params
max_conv_dim: 512 # same as max_conv_dim in generator_params
discriminator_params:
dim_in: 64 # same as dim_in in generator_params
num_domains: 20 # same as num_domains in mapping_network_params
max_conv_dim: 512 # same as max_conv_dim in generator_params
n_repeat: 4
repeat_num: 4
asr_params:
input_dim: 80
hidden_dim: 256
n_token: 80
token_embedding_dim: 256
###########################################################
# ADVERSARIAL LOSS SETTING #
###########################################################
loss_params:
g_loss:
lambda_sty: 1.
lambda_cyc: 5.
lambda_ds: 1.
lambda_norm: 1.
lambda_asr: 10.
lambda_f0: 5.
lambda_f0_sty: 0.1
lambda_adv: 2.
lambda_adv_cls: 0.5
norm_bias: 0.5
d_loss:
lambda_reg: 1.
lambda_adv_cls: 0.1
lambda_con_reg: 10.
adv_cls_epoch: 50
con_reg_epoch: 30
###########################################################
# DATA LOADER SETTING #
###########################################################
batch_size: 5 # Batch size.
num_workers: 2 # Number of workers in DataLoader.
max_mel_length: 192
###########################################################
# OPTIMIZER & SCHEDULER SETTING #
###########################################################
generator_optimizer_params:
beta1: 0.0
beta2: 0.99
weight_decay: 1.0e-4
epsilon: 1.0e-9
generator_scheduler_params:
max_learning_rate: 2.0e-4
phase_pct: 0.0
divide_factor: 1
total_steps: 200000 # train_max_steps
end_learning_rate: 2.0e-4
style_encoder_optimizer_params:
beta1: 0.0
beta2: 0.99
weight_decay: 1.0e-4
epsilon: 1.0e-9
style_encoder_scheduler_params:
max_learning_rate: 2.0e-4
phase_pct: 0.0
divide_factor: 1
total_steps: 200000 # train_max_steps
end_learning_rate: 2.0e-4
mapping_network_optimizer_params:
beta1: 0.0
beta2: 0.99
weight_decay: 1.0e-4
epsilon: 1.0e-9
mapping_network_scheduler_params:
max_learning_rate: 2.0e-6
phase_pct: 0.0
divide_factor: 1
total_steps: 200000 # train_max_steps
end_learning_rate: 2.0e-6
discriminator_optimizer_params:
beta1: 0.0
beta2: 0.99
weight_decay: 1.0e-4
epsilon: 1.0e-9
discriminator_scheduler_params:
max_learning_rate: 2.0e-4
phase_pct: 0.0
divide_factor: 1
total_steps: 200000 # train_max_steps
end_learning_rate: 2.0e-4
###########################################################
# TRAINING SETTING #
###########################################################
max_epoch: 150
num_snapshots: 5
seed: 1
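The optimizer and scheduler blocks above use the parameter names of Paddle's `OneCycleLR` scheduler and `AdamW` optimizer. The sketch below shows how the generator settings could be instantiated; it is an illustration only (the stand-in `paddle.nn.Linear` layer replaces the real generator, and the actual StarGANv2-VC trainer may wire this differently):
```python
# Rough sketch: build a one-cycle scheduler and AdamW optimizer from the
# generator_optimizer_params / generator_scheduler_params values above.
import paddle

net = paddle.nn.Linear(4, 4)  # stand-in for the generator; illustration only

scheduler = paddle.optimizer.lr.OneCycleLR(
    max_learning_rate=2.0e-4,   # generator_scheduler_params.max_learning_rate
    total_steps=200000,         # generator_scheduler_params.total_steps
    divide_factor=1,            # generator_scheduler_params.divide_factor
    end_learning_rate=2.0e-4,   # generator_scheduler_params.end_learning_rate
    phase_pct=0.0)              # generator_scheduler_params.phase_pct

optimizer = paddle.optimizer.AdamW(
    learning_rate=scheduler,
    beta1=0.0,                  # generator_optimizer_params.beta1
    beta2=0.99,                 # generator_optimizer_params.beta2
    epsilon=1.0e-9,             # generator_optimizer_params.epsilon
    weight_decay=1.0e-4,        # generator_optimizer_params.weight_decay
    parameters=net.parameters())
```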

@ -6,13 +6,32 @@ stop_stage=100
config_path=$1
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# extract features
echo "Extract features ..."
python3 ${BIN_DIR}/preprocess.py \
--dataset=vctk \
--rootdir=~/datasets/VCTK-Corpus-0.92/ \
--dumpdir=dump \
--config=${config_path} \
--num-cpu=20
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
echo "Normalize ..."
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/train/raw/metadata.jsonl \
--dumpdir=dump/train/norm \
--speaker-dict=dump/speaker_id_map.txt
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/dev/raw/metadata.jsonl \
--dumpdir=dump/dev/norm \
--speaker-dict=dump/speaker_id_map.txt
python3 ${BIN_DIR}/normalize.py \
--metadata=dump/test/raw/metadata.jsonl \
--dumpdir=dump/test/norm \
--speaker-dict=dump/speaker_id_map.txt
fi

@ -9,5 +9,4 @@ python3 ${BIN_DIR}/train.py \
--config=${config_path} \
--output-dir=${train_output_path} \
--ngpu=1 \
--phones-dict=dump/phone_id_map.txt \
--speaker-dict=dump/speaker_id_map.txt

@ -13,3 +13,7 @@
# limitations under the License.
import _locale
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
__version__ = '0.0.0'
__commit__ = '9cf8c1985a98bb380c183116123672976bdfe5c9'

@ -133,10 +133,10 @@ def _get_download(url, fullname):
total_size = req.headers.get('content-length')
with open(tmp_fullname, 'wb') as f:
if total_size:
with tqdm(total=(int(total_size) + 1023) // 1024) as pbar:
with tqdm(total=(int(total_size)), unit='B', unit_scale=True) as pbar:
for chunk in req.iter_content(chunk_size=1024):
f.write(chunk)
pbar.update(1)
pbar.update(len(chunk))
else:
for chunk in req.iter_content(chunk_size=1024):
if chunk:

@ -0,0 +1,14 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .aidatatang_200zh import main as aidatatang_200zh_main

@ -0,0 +1,158 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare aidatatang_200zh mandarin dataset
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
from paddlespeech.utils.argparse import print_arguments
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = 'http://www.openslr.org/resources/62'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/aidatatang_200zh",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = []
transcript_path = os.path.join(data_dir, 'transcript',
'aidatatang_200_zh_transcript.txt')
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove whitespace to get character-level text
text = ''.join(text.split())
transcript_dict[audio_id] = text
data_types = ['train', 'dev', 'test']
for dtype in data_types:
del json_lines[:]
total_sec = 0.0
total_text = 0.0
total_num = 0
audio_dir = os.path.join(data_dir, 'corpus/', dtype)
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
if not fname.endswith('.wav'):
continue
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
json_lines.append(
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text,
},
ensure_ascii=False))
total_sec += duration
total_text += len(text)
total_num += 1
manifest_path = manifest_path_prefix + '.' + dtype
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
manifest_dir = os.path.dirname(manifest_path_prefix)
meta_path = os.path.join(manifest_dir, dtype) + '.meta'
with open(meta_path, 'w') as f:
print(f"{dtype}:", file=f)
print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f)
print(f"{total_text} text", file=f)
print(f"{total_text / total_sec} text/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f)
def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
"""Download, unpack and create manifest file."""
data_dir = os.path.join(target_dir, subset)
if not os.path.exists(data_dir):
filepath = download(url, md5sum, target_dir)
unpack(filepath, target_dir)
# unpack all audio tar files
audio_dir = os.path.join(data_dir, 'corpus')
for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)):
for sub in dirlist:
print(f"unpack dir {sub}...")
for folder, _, filelist in sorted(
os.walk(os.path.join(subfolder, sub))):
for ftar in filelist:
unpack(os.path.join(folder, ftar), folder, True)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
create_manifest(data_dir, manifest_path)
def main():
print_arguments(args, globals())
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
prepare_dataset(
url=DATA_URL,
md5sum=MD5_DATA,
target_dir=args.target_dir,
manifest_path=args.manifest_prefix,
subset='aidatatang_200zh')
print("Data download and manifest prepare done!")
if __name__ == '__main__':
main()

@ -0,0 +1,58 @@
# [Aishell1](http://openslr.elda.org/33/)
This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is a part of AISHELL-ASR0009, whose utterances cover 11 domains, including smart home, autonomous driving, and industrial production. All recordings were made in a quiet indoor environment, using 3 different devices at the same time: a high-fidelity microphone (44.1 kHz, 16-bit), an Android phone (16 kHz, 16-bit), and an iOS phone (16 kHz, 16-bit). The high-fidelity audio was re-sampled to 16 kHz to build AISHELL-ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. The manual transcription accuracy is above 95%, achieved through professional speech annotation and strict quality inspection. The corpus is divided into training, development and testing sets. (This database is free for academic research; it may not be used commercially without permission.)
## Dataset Architecture
```bash
data_aishell
├── transcript # transcript directory
└── wav # audio directory
├── dev # dev set
│ ├── S0724 # speaker directory
│ ├── S0725
│ ├── S0726
├── train
│ ├── S0724
│ ├── S0725
│ ├── S0726
├── test
│ ├── S0724
│ ├── S0725
│ ├── S0726
data_aishell
├── transcript
│ └── aishell_transcript_v0.8.txt # transcript annotation file
└── wav
├── dev
│ ├── S0724
│ │ ├── BAC009S0724W0121.wav # audio of speaker S0724
│ │ ├── BAC009S0724W0122.wav
│ │ ├── BAC009S0724W0123.wav
├── test
│ ├── S0724
│ │ ├── BAC009S0724W0121.wav
│ │ ├── BAC009S0724W0122.wav
│ │ ├── BAC009S0724W0123.wav
├── train
│ ├── S0724
│ │ ├── BAC009S0724W0121.wav
│ │ ├── BAC009S0724W0122.wav
│ │ ├── BAC009S0724W0123.wav
Transcript file format: <utt> <tokens>
> head data_aishell/transcript/aishell_transcript_v0.8.txt
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
BAC009S0002W0131 显示 出 了 极 强 的 威力
```

@ -0,0 +1,18 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .aishell import check_dataset
from .aishell import create_manifest
from .aishell import download_dataset
from .aishell import main as aishell_main
from .aishell import prepare_dataset

@ -0,0 +1,230 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare Aishell mandarin dataset
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
from paddlespeech.utils.argparse import print_arguments
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = 'http://openslr.elda.org/resources/33'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
DATA_URL = URL_ROOT + '/data_aishell.tgz'
MD5_DATA = '2f494334227864a8a8fec932999db9d8'
RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/Aishell",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % os.path.join(data_dir,
manifest_path_prefix))
json_lines = []
transcript_path = os.path.join(data_dir, 'transcript',
'aishell_transcript_v0.8.txt')
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove whitespace to get character-level text
text = ''.join(text.split())
transcript_dict[audio_id] = text
data_metas = dict()
data_types = ['train', 'dev', 'test']
for dtype in data_types:
del json_lines[:]
total_sec = 0.0
total_text = 0.0
total_num = 0
audio_dir = os.path.join(data_dir, 'wav', dtype)
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
# skip audio files that have no transcription
if audio_id not in transcript_dict:
continue
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
json_lines.append(
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text
},
ensure_ascii=False))
total_sec += duration
total_text += len(text)
total_num += 1
manifest_path = manifest_path_prefix + '.' + dtype
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
meta = dict()
meta["dtype"] = dtype # train, dev, test
meta["utts"] = total_num
meta["hours"] = total_sec / (60 * 60)
meta["text"] = total_text
meta["text/sec"] = total_text / total_sec
meta["sec/utt"] = total_sec / total_num
data_metas[dtype] = meta
manifest_dir = os.path.dirname(manifest_path_prefix)
meta_path = os.path.join(manifest_dir, dtype) + '.meta'
with open(meta_path, 'w') as f:
for key, val in meta.items():
print(f"{key}: {val}", file=f)
return data_metas
def download_dataset(url, md5sum, target_dir):
"""Download, unpack and create manifest file."""
data_dir = os.path.join(target_dir, 'data_aishell')
if not os.path.exists(data_dir):
filepath = download(url, md5sum, target_dir)
unpack(filepath, target_dir)
# unpack all audio tar files
audio_dir = os.path.join(data_dir, 'wav')
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for ftar in filelist:
unpack(os.path.join(subfolder, ftar), subfolder, True)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
os.path.abspath(target_dir))
return os.path.abspath(data_dir)
def check_dataset(data_dir):
print(f"check dataset {os.path.abspath(data_dir)} ...")
transcript_path = os.path.join(data_dir, 'transcript',
'aishell_transcript_v0.8.txt')
if not os.path.exists(transcript_path):
raise FileNotFoundError(f"no transcript file found in {data_dir}.")
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove whitespace to get character-level text
text = ''.join(text.split())
transcript_dict[audio_id] = text
no_label = 0
data_types = ['train', 'dev', 'test']
for dtype in data_types:
audio_dir = os.path.join(data_dir, 'wav', dtype)
if not os.path.exists(audio_dir):
raise IOError(f"{audio_dir} does not exist.")
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
# skip audio files that have no transcription
if audio_id not in transcript_dict:
print(f"Warning: {audio_id} not has transcript.")
no_label += 1
continue
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
assert samplerate == 16000, f"{audio_path} sample rate is {samplerate} not 16k, please check."
print(f"Warning: {dtype} has {no_label} audio does not has transcript.")
def prepare_dataset(url, md5sum, target_dir, manifest_path=None, check=False):
"""Download, unpack and create manifest file."""
data_dir = download_dataset(url, md5sum, target_dir)
if check:
try:
check_dataset(data_dir)
except Exception as e:
raise ValueError(
f"{data_dir} dataset format is not right, please check it.") from e
meta = None
if manifest_path:
meta = create_manifest(data_dir, manifest_path)
return data_dir, meta
def main():
print_arguments(args, globals())
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
data_dir, meta = prepare_dataset(
url=DATA_URL,
md5sum=MD5_DATA,
target_dir=args.target_dir,
manifest_path=args.manifest_prefix,
check=True)
resource_dir, _ = prepare_dataset(
url=RESOURCE_URL,
md5sum=MD5_RESOURCE,
target_dir=args.target_dir,
manifest_path=None)
print("Data download and manifest prepare done!")
if __name__ == '__main__':
main()

@ -19,91 +19,16 @@ import zipfile
from typing import Text
__all__ = [
"check_md5sum", "getfile_insensitive", "download_multi", "download",
"unpack", "unzip", "md5file", "print_arguments", "add_arguments",
"get_commandline_args"
"check_md5sum",
"getfile_insensitive",
"download_multi",
"download",
"unpack",
"unzip",
"md5file",
]
def get_commandline_args():
extra_chars = [
" ",
";",
"&",
"(",
")",
"|",
"^",
"<",
">",
"?",
"*",
"[",
"]",
"$",
"`",
'"',
"\\",
"!",
"{",
"}",
]
# Escape the extra characters for shell
argv = [
arg.replace("'", "'\\''") if all(char not in arg
for char in extra_chars) else
"'" + arg.replace("'", "'\\''") + "'" for arg in sys.argv
]
return sys.executable + " " + " ".join(argv)
def print_arguments(args, info=None):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
filename = ""
if info:
filename = info["__file__"]
filename = os.path.basename(filename)
print(f"----------- {filename} Configuration Arguments -----------")
for arg, value in sorted(vars(args).items()):
print("%s: %s" % (arg, value))
print("-----------------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def md5file(fname):
hash_md5 = hashlib.md5()
f = open(fname, "rb")

@ -0,0 +1,20 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# s2t utils binaries.
from .avg_model import main as avg_ckpts_main
from .build_vocab import main as build_vocab_main
from .compute_mean_std import main as compute_mean_std_main
from .compute_wer import main as compute_wer_main
from .format_data import main as format_data_main
from .format_rsl import main as format_rsl_main

@ -0,0 +1,125 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import glob
import json
import os
import numpy as np
import paddle
def define_argparse():
parser = argparse.ArgumentParser(description='average model')
parser.add_argument('--dst_model', required=True, help='averaged model')
parser.add_argument(
'--ckpt_dir', required=True, help='ckpt model dir for average')
parser.add_argument(
'--val_best', action="store_true", help='averaged model')
parser.add_argument(
'--num', default=5, type=int, help='nums for averaged model')
parser.add_argument(
'--min_epoch',
default=0,
type=int,
help='min epoch used for averaging model')
parser.add_argument(
'--max_epoch',
default=65536, # Big enough
type=int,
help='max epoch used for averaging model')
args = parser.parse_args()
return args
def average_checkpoints(dst_model="",
ckpt_dir="",
val_best=True,
num=5,
min_epoch=0,
max_epoch=65536):
paddle.set_device('cpu')
val_scores = []
jsons = glob.glob(f'{ckpt_dir}/[!train]*.json')
jsons = sorted(jsons, key=os.path.getmtime, reverse=True)
for y in jsons:
with open(y, 'r') as f:
dic_json = json.load(f)
loss = dic_json['val_loss']
epoch = dic_json['epoch']
if epoch >= min_epoch and epoch <= max_epoch:
val_scores.append((epoch, loss))
assert val_scores, f"No valid checkpoints found in {ckpt_dir}"
val_scores = np.array(val_scores)
if val_best:
sort_idx = np.argsort(val_scores[:, 1])
sorted_val_scores = val_scores[sort_idx]
else:
sorted_val_scores = val_scores
best_val_scores = sorted_val_scores[:num, 1]
selected_epochs = sorted_val_scores[:num, 0].astype(np.int64)
avg_val_score = np.mean(best_val_scores)
print("selected val scores = " + str(best_val_scores))
print("selected epochs = " + str(selected_epochs))
print("averaged val score = " + str(avg_val_score))
path_list = [
ckpt_dir + '/{}.pdparams'.format(int(epoch))
for epoch in sorted_val_scores[:num, 0]
]
print(path_list)
avg = None
assert num == len(path_list)
for path in path_list:
print(f'Processing {path}')
states = paddle.load(path)
if avg is None:
avg = states
else:
for k in avg.keys():
avg[k] += states[k]
# average
for k in avg.keys():
if avg[k] is not None:
avg[k] /= num
paddle.save(avg, dst_model)
print(f'Saving to {dst_model}')
meta_path = os.path.splitext(dst_model)[0] + '.avg.json'
with open(meta_path, 'w') as f:
data = json.dumps({
"mode": 'val_best' if args.val_best else 'latest',
"avg_ckpt": args.dst_model,
"val_loss_mean": avg_val_score,
"ckpts": path_list,
"epochs": selected_epochs.tolist(),
"val_losses": beat_val_scores.tolist(),
})
f.write(data + "\n")
def main():
args = define_argparse()
average_checkpoints(**vars(args))
if __name__ == '__main__':
main()

@ -0,0 +1,166 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build vocabulary from manifest files.
Each item in vocabulary file is a character.
"""
import argparse
import functools
import os
import tempfile
from collections import Counter
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import BLANK
from paddlespeech.s2t.frontend.utility import SOS
from paddlespeech.s2t.frontend.utility import SPACE
from paddlespeech.s2t.frontend.utility import UNK
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def count_manifest(counter, text_feature, manifest_path):
manifest_jsons = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest_jsons.append(json_data)
for line_json in manifest_jsons:
if isinstance(line_json['text'], str):
tokens = text_feature.tokenize(
line_json['text'], replace_space=False)
counter.update(tokens)
else:
assert isinstance(line_json['text'], list)
for text in line_json['text']:
tokens = text_feature.tokenize(text, replace_space=False)
counter.update(tokens)
def dump_text_manifest(fileobj, manifest_path, key='text'):
manifest_jsons = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest_jsons.append(json_data)
for line_json in manifest_jsons:
if isinstance(line_json[key], str):
fileobj.write(line_json[key] + "\n")
else:
assert isinstance(line_json[key], list)
for line in line_json[key]:
fileobj.write(line + "\n")
def build_vocab(manifest_paths="",
vocab_path="examples/librispeech/data/vocab.txt",
unit_type="char",
count_threshold=0,
text_keys='text',
spm_mode="unigram",
spm_vocab_size=0,
spm_model_prefix="",
spm_character_coverage=0.9995):
fout = open(vocab_path, 'w', encoding='utf-8')
fout.write(BLANK + "\n") # 0 will be used for "blank" in CTC
fout.write(UNK + '\n') # <unk> must be 1
if unit_type == 'spm':
# tools/spm_train --input=$wave_data/lang_char/input.txt
# --vocab_size=${nbpe} --model_type=${bpemode}
# --model_prefix=${bpemodel} --input_sentence_size=100000000
import sentencepiece as spm
fp = tempfile.NamedTemporaryFile(mode='w', delete=False)
for manifest_path in manifest_paths:
_text_keys = [text_keys] if type(
text_keys) is not list else text_keys
for text_key in _text_keys:
dump_text_manifest(fp, manifest_path, key=text_key)
fp.close()
# train
spm.SentencePieceTrainer.Train(
input=fp.name,
vocab_size=spm_vocab_size,
model_type=spm_mode,
model_prefix=spm_model_prefix,
input_sentence_size=100000000,
character_coverage=spm_character_coverage)
os.unlink(fp.name)
# encode
text_feature = TextFeaturizer(unit_type, "", spm_model_prefix)
counter = Counter()
for manifest_path in manifest_paths:
count_manifest(counter, text_feature, manifest_path)
count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
tokens = []
for token, count in count_sorted:
if count < count_threshold:
break
# replace space by `<space>`
token = SPACE if token == ' ' else token
tokens.append(token)
tokens = sorted(tokens)
for token in tokens:
fout.write(token + '\n')
fout.write(SOS + "\n") # <sos/eos>
fout.close()
def define_argparse():
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
add_arg('count_threshold', int, 0,
"Truncation threshold for char/word counts.Default 0, no truncate.")
add_arg('vocab_path', str,
'examples/librispeech/data/vocab.txt',
"Filepath to write the vocabulary.")
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.",
nargs='+',
required=True)
add_arg('text_keys', str,
'text',
"keys of the text in manifest for building vocabulary. "
"You can provide multiple k.",
nargs='+')
# bpe
add_arg('spm_vocab_size', int, 0, "Vocab size for spm.")
add_arg('spm_mode', str, 'unigram', "spm model type, e.g. unigram, bpe, char, word. Only needed when `unit_type` is spm")
add_arg('spm_model_prefix', str, "", "spm_model_%(spm_mode)_%(count_threshold), spm model prefix, only needed when `unit_type` is spm")
add_arg('spm_character_coverage', float, 0.9995, "character coverage to determine the minimum symbols")
# yapf: disable
args = parser.parse_args()
return args
def main():
args = define_argparse()
print_arguments(args, globals())
build_vocab(**vars(args))
if __name__ == '__main__':
main()

@ -0,0 +1,106 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compute mean and std for feature normalizer, and save to file."""
import argparse
import functools
from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline
from paddlespeech.s2t.frontend.featurizer.audio_featurizer import AudioFeaturizer
from paddlespeech.s2t.frontend.normalizer import FeatureNormalizer
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def compute_cmvn(manifest_path="data/librispeech/manifest.train",
output_path="data/librispeech/mean_std.npz",
num_samples=2000,
num_workers=0,
spectrum_type="linear",
feat_dim=13,
delta_delta=False,
stride_ms=10,
window_ms=20,
sample_rate=16000,
use_dB_normalization=True,
target_dB=-20):
augmentation_pipeline = AugmentationPipeline('{}')
audio_featurizer = AudioFeaturizer(
spectrum_type=spectrum_type,
feat_dim=feat_dim,
delta_delta=delta_delta,
stride_ms=float(stride_ms),
window_ms=float(window_ms),
n_fft=None,
max_freq=None,
target_sample_rate=sample_rate,
use_dB_normalization=use_dB_normalization,
target_dB=target_dB,
dither=0.0)
def augment_and_featurize(audio_segment):
augmentation_pipeline.transform_audio(audio_segment)
return audio_featurizer.featurize(audio_segment)
normalizer = FeatureNormalizer(
mean_std_filepath=None,
manifest_path=manifest_path,
featurize_func=augment_and_featurize,
num_samples=num_samples,
num_workers=num_workers)
normalizer.write_to_file(output_path)
def define_argparse():
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('manifest_path', str,
'data/librispeech/manifest.train',
"Filepath of manifest to compute normalizer's mean and stddev.")
add_arg('output_path', str,
'data/librispeech/mean_std.npz',
"Filepath of write mean and stddev to (.npz).")
add_arg('num_samples', int, 2000, "# of samples used for statistics.")
add_arg('num_workers',
default=0,
type=int,
help='num of subprocess workers for processing')
add_arg('spectrum_type', str,
'linear',
"Audio feature type. Options: linear, mfcc, fbank.",
choices=['linear', 'mfcc', 'fbank'])
add_arg('feat_dim', int, 13, "Audio feature dim.")
add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
add_arg('stride_ms', int, 10, "stride length in ms.")
add_arg('window_ms', int, 20, "window length in ms.")
add_arg('sample_rate', int, 16000, "target sample rate.")
add_arg('use_dB_normalization', bool, True, "do dB normalization.")
add_arg('target_dB', int, -20, "target dB.")
# yapf: disable
args = parser.parse_args()
return args
def main():
args = define_argparse()
print_arguments(args, globals())
compute_cmvn(**vars(args))
if __name__ == '__main__':
main()

@ -0,0 +1,558 @@
# Copyright 2021 Mobvoi Inc. All Rights Reserved.
# flake8: noqa
import codecs
import re
import sys
import unicodedata
remove_tag = True
spacelist = [' ', '\t', '\r', '\n']
puncts = [
'!', ',', '?', '、', '。', '!', ',', ';', '?', ':', '「', '」', '︰', '『', '』',
'《', '》'
]
def characterize(string):
res = []
i = 0
while i < len(string):
char = string[i]
if char in puncts:
i += 1
continue
cat1 = unicodedata.category(char)
#https://unicodebook.readthedocs.io/unicode.html#unicode-categories
if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned
i += 1
continue
if cat1 == 'Lo': # letter-other
res.append(char)
i += 1
else:
# some input looks like: <unk><noise>, we want to separate it to two words.
sep = ' '
if char == '<': sep = '>'
j = i + 1
while j < len(string):
c = string[j]
if ord(c) >= 128 or (c in spacelist) or (c == sep):
break
j += 1
if j < len(string) and string[j] == '>':
j += 1
res.append(string[i:j])
i = j
return res
def stripoff_tags(x):
if not x: return ''
chars = []
i = 0
T = len(x)
while i < T:
if x[i] == '<':
while i < T and x[i] != '>':
i += 1
i += 1
else:
chars.append(x[i])
i += 1
return ''.join(chars)
def normalize(sentence, ignore_words, cs, split=None):
""" sentence, ignore_words are both in unicode
"""
new_sentence = []
for token in sentence:
x = token
if not cs:
x = x.upper()
if x in ignore_words:
continue
if remove_tag:
x = stripoff_tags(x)
if not x:
continue
if split and x in split:
new_sentence += split[x]
else:
new_sentence.append(x)
return new_sentence
class Calculator:
def __init__(self):
self.data = {}
self.space = []
self.cost = {}
self.cost['cor'] = 0
self.cost['sub'] = 1
self.cost['del'] = 1
self.cost['ins'] = 1
def calculate(self, lab, rec):
# Initialization
lab.insert(0, '')
rec.insert(0, '')
while len(self.space) < len(lab):
self.space.append([])
for row in self.space:
for element in row:
element['dist'] = 0
element['error'] = 'non'
while len(row) < len(rec):
row.append({'dist': 0, 'error': 'non'})
for i in range(len(lab)):
self.space[i][0]['dist'] = i
self.space[i][0]['error'] = 'del'
for j in range(len(rec)):
self.space[0][j]['dist'] = j
self.space[0][j]['error'] = 'ins'
self.space[0][0]['error'] = 'non'
for token in lab:
if token not in self.data and len(token) > 0:
self.data[token] = {
'all': 0,
'cor': 0,
'sub': 0,
'ins': 0,
'del': 0
}
for token in rec:
if token not in self.data and len(token) > 0:
self.data[token] = {
'all': 0,
'cor': 0,
'sub': 0,
'ins': 0,
'del': 0
}
# Computing edit distance
for i, lab_token in enumerate(lab):
for j, rec_token in enumerate(rec):
if i == 0 or j == 0:
continue
min_dist = sys.maxsize
min_error = 'none'
dist = self.space[i - 1][j]['dist'] + self.cost['del']
error = 'del'
if dist < min_dist:
min_dist = dist
min_error = error
dist = self.space[i][j - 1]['dist'] + self.cost['ins']
error = 'ins'
if dist < min_dist:
min_dist = dist
min_error = error
if lab_token == rec_token:
dist = self.space[i - 1][j - 1]['dist'] + self.cost['cor']
error = 'cor'
else:
dist = self.space[i - 1][j - 1]['dist'] + self.cost['sub']
error = 'sub'
if dist < min_dist:
min_dist = dist
min_error = error
self.space[i][j]['dist'] = min_dist
self.space[i][j]['error'] = min_error
# Tracing back
result = {
'lab': [],
'rec': [],
'all': 0,
'cor': 0,
'sub': 0,
'ins': 0,
'del': 0
}
i = len(lab) - 1
j = len(rec) - 1
while True:
if self.space[i][j]['error'] == 'cor': # correct
if len(lab[i]) > 0:
self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
self.data[lab[i]]['cor'] = self.data[lab[i]]['cor'] + 1
result['all'] = result['all'] + 1
result['cor'] = result['cor'] + 1
result['lab'].insert(0, lab[i])
result['rec'].insert(0, rec[j])
i = i - 1
j = j - 1
elif self.space[i][j]['error'] == 'sub': # substitution
if len(lab[i]) > 0:
self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
self.data[lab[i]]['sub'] = self.data[lab[i]]['sub'] + 1
result['all'] = result['all'] + 1
result['sub'] = result['sub'] + 1
result['lab'].insert(0, lab[i])
result['rec'].insert(0, rec[j])
i = i - 1
j = j - 1
elif self.space[i][j]['error'] == 'del': # deletion
if len(lab[i]) > 0:
self.data[lab[i]]['all'] = self.data[lab[i]]['all'] + 1
self.data[lab[i]]['del'] = self.data[lab[i]]['del'] + 1
result['all'] = result['all'] + 1
result['del'] = result['del'] + 1
result['lab'].insert(0, lab[i])
result['rec'].insert(0, "")
i = i - 1
elif self.space[i][j]['error'] == 'ins': # insertion
if len(rec[j]) > 0:
self.data[rec[j]]['ins'] = self.data[rec[j]]['ins'] + 1
result['ins'] = result['ins'] + 1
result['lab'].insert(0, "")
result['rec'].insert(0, rec[j])
j = j - 1
elif self.space[i][j]['error'] == 'non': # starting point
break
else: # shouldn't reach here
print(
'this should not happen , i = {i} , j = {j} , error = {error}'.
format(i=i, j=j, error=self.space[i][j]['error']))
return result
def overall(self):
result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
for token in self.data:
result['all'] = result['all'] + self.data[token]['all']
result['cor'] = result['cor'] + self.data[token]['cor']
result['sub'] = result['sub'] + self.data[token]['sub']
result['ins'] = result['ins'] + self.data[token]['ins']
result['del'] = result['del'] + self.data[token]['del']
return result
def cluster(self, data):
result = {'all': 0, 'cor': 0, 'sub': 0, 'ins': 0, 'del': 0}
for token in data:
if token in self.data:
result['all'] = result['all'] + self.data[token]['all']
result['cor'] = result['cor'] + self.data[token]['cor']
result['sub'] = result['sub'] + self.data[token]['sub']
result['ins'] = result['ins'] + self.data[token]['ins']
result['del'] = result['del'] + self.data[token]['del']
return result
def keys(self):
return list(self.data.keys())
def width(string):
return sum(1 + (unicodedata.east_asian_width(c) in "AFW") for c in string)
def default_cluster(word):
unicode_names = [unicodedata.name(char) for char in word]
for i in reversed(range(len(unicode_names))):
if unicode_names[i].startswith('DIGIT'): # 1
unicode_names[i] = 'Number' # 'DIGIT'
elif (unicode_names[i].startswith('CJK UNIFIED IDEOGRAPH') or
unicode_names[i].startswith('CJK COMPATIBILITY IDEOGRAPH')):
# 明 / 郎
unicode_names[i] = 'Mandarin' # 'CJK IDEOGRAPH'
elif (unicode_names[i].startswith('LATIN CAPITAL LETTER') or
unicode_names[i].startswith('LATIN SMALL LETTER')):
# A / a
unicode_names[i] = 'English' # 'LATIN LETTER'
elif unicode_names[i].startswith('HIRAGANA LETTER'): # は こ め
unicode_names[i] = 'Japanese' # 'GANA LETTER'
elif (unicode_names[i].startswith('AMPERSAND') or
unicode_names[i].startswith('APOSTROPHE') or
unicode_names[i].startswith('COMMERCIAL AT') or
unicode_names[i].startswith('DEGREE CELSIUS') or
unicode_names[i].startswith('EQUALS SIGN') or
unicode_names[i].startswith('FULL STOP') or
unicode_names[i].startswith('HYPHEN-MINUS') or
unicode_names[i].startswith('LOW LINE') or
unicode_names[i].startswith('NUMBER SIGN') or
unicode_names[i].startswith('PLUS SIGN') or
unicode_names[i].startswith('SEMICOLON')):
# & / ' / @ / ℃ / = / . / - / _ / # / + / ;
del unicode_names[i]
else:
return 'Other'
if len(unicode_names) == 0:
return 'Other'
if len(unicode_names) == 1:
return unicode_names[0]
for i in range(len(unicode_names) - 1):
if unicode_names[i] != unicode_names[i + 1]:
return 'Other'
return unicode_names[0]
def usage():
print(
"compute-wer.py : compute word error rate (WER) and align recognition results and references."
)
print(
" usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer"
)
def main():
# python utils/compute-wer.py --char=1 --v=1 ref hyp > rsl.error
global remove_tag
if len(sys.argv) == 1:
usage()
sys.exit(0)
calculator = Calculator()
cluster_file = ''
ignore_words = set()
tochar = False
verbose = 1
padding_symbol = ' '
case_sensitive = False
max_words_per_line = sys.maxsize
split = None
while len(sys.argv) > 3:
a = '--maxw='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):]
del sys.argv[1]
max_words_per_line = int(b)
continue
a = '--rt='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
remove_tag = (b == 'true') or (b != '0')
continue
a = '--cs='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
case_sensitive = (b == 'true') or (b != '0')
continue
a = '--cluster='
if sys.argv[1].startswith(a):
cluster_file = sys.argv[1][len(a):]
del sys.argv[1]
continue
a = '--splitfile='
if sys.argv[1].startswith(a):
split_file = sys.argv[1][len(a):]
del sys.argv[1]
split = dict()
with codecs.open(split_file, 'r', 'utf-8') as fh:
for line in fh: # line in unicode
words = line.strip().split()
if len(words) >= 2:
split[words[0]] = words[1:]
continue
a = '--ig='
if sys.argv[1].startswith(a):
ignore_file = sys.argv[1][len(a):]
del sys.argv[1]
with codecs.open(ignore_file, 'r', 'utf-8') as fh:
for line in fh: # line in unicode
line = line.strip()
if len(line) > 0:
ignore_words.add(line)
continue
a = '--char='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
tochar = (b == 'true') or (b != '0')
continue
a = '--v='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
verbose = 0
try:
verbose = int(b)
except:
if b == 'true' or b != '0':
verbose = 1
continue
a = '--padding-symbol='
if sys.argv[1].startswith(a):
b = sys.argv[1][len(a):].lower()
del sys.argv[1]
if b == 'space':
padding_symbol = ' '
elif b == 'underline':
padding_symbol = '_'
continue
if True or sys.argv[1].startswith('-'):
#ignore invalid switch
del sys.argv[1]
continue
if not case_sensitive:
ig = set([w.upper() for w in ignore_words])
ignore_words = ig
default_clusters = {}
default_words = {}
ref_file = sys.argv[1]
hyp_file = sys.argv[2]
rec_set = {}
if split and not case_sensitive:
newsplit = dict()
for w in split:
words = split[w]
for i in range(len(words)):
words[i] = words[i].upper()
newsplit[w.upper()] = words
split = newsplit
with codecs.open(hyp_file, 'r', 'utf-8') as fh:
for line in fh:
if tochar:
array = characterize(line)
else:
array = line.strip().split()
if len(array) == 0: continue
fid = array[0]
rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive,
split)
# compute error rate on the intersection of reference file and hyp file
for line in open(ref_file, 'r', encoding='utf-8'):
if tochar:
array = characterize(line)
else:
array = line.rstrip('\n').split()
if len(array) == 0: continue
fid = array[0]
if fid not in rec_set:
continue
lab = normalize(array[1:], ignore_words, case_sensitive, split)
rec = rec_set[fid]
if verbose:
print('\nutt: %s' % fid)
for word in rec + lab:
if word not in default_words:
default_cluster_name = default_cluster(word)
if default_cluster_name not in default_clusters:
default_clusters[default_cluster_name] = {}
if word not in default_clusters[default_cluster_name]:
default_clusters[default_cluster_name][word] = 1
default_words[word] = default_cluster_name
result = calculator.calculate(lab, rec)
if verbose:
if result['all'] != 0:
wer = float(result['ins'] + result['sub'] + result[
'del']) * 100.0 / result['all']
else:
wer = 0.0
print('WER: %4.2f %%' % wer, end=' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'],
result['ins']))
space = {}
space['lab'] = []
space['rec'] = []
for idx in range(len(result['lab'])):
len_lab = width(result['lab'][idx])
len_rec = width(result['rec'][idx])
length = max(len_lab, len_rec)
space['lab'].append(length - len_lab)
space['rec'].append(length - len_rec)
upper_lab = len(result['lab'])
upper_rec = len(result['rec'])
lab1, rec1 = 0, 0
while lab1 < upper_lab or rec1 < upper_rec:
if verbose > 1:
print('lab(%s):' % fid.encode('utf-8'), end=' ')
else:
print('lab:', end=' ')
lab2 = min(upper_lab, lab1 + max_words_per_line)
for idx in range(lab1, lab2):
token = result['lab'][idx]
print('{token}'.format(token=token), end='')
for n in range(space['lab'][idx]):
print(padding_symbol, end='')
print(' ', end='')
print()
if verbose > 1:
                    print('rec(%s):' % fid, end=' ')
else:
print('rec:', end=' ')
rec2 = min(upper_rec, rec1 + max_words_per_line)
for idx in range(rec1, rec2):
token = result['rec'][idx]
print('{token}'.format(token=token), end='')
for n in range(space['rec'][idx]):
print(padding_symbol, end='')
print(' ', end='')
print('\n', end='\n')
lab1 = lab2
rec1 = rec2
if verbose:
print(
'==========================================================================='
)
print()
result = calculator.overall()
if result['all'] != 0:
wer = float(result['ins'] + result['sub'] + result[
'del']) * 100.0 / result['all']
else:
wer = 0.0
print('Overall -> %4.2f %%' % wer, end=' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'],
result['ins']))
if not verbose:
print()
if verbose:
for cluster_id in default_clusters:
result = calculator.cluster(
[k for k in default_clusters[cluster_id]])
if result['all'] != 0:
wer = float(result['ins'] + result['sub'] + result[
'del']) * 100.0 / result['all']
else:
wer = 0.0
print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'], result['del'],
result['ins']))
if len(cluster_file) > 0: # compute separated WERs for word clusters
cluster_id = ''
cluster = []
for line in open(cluster_file, 'r', encoding='utf-8'):
            for token in line.rstrip('\n').split():
# end of cluster reached, like </Keyword>
if token[0:2] == '</' and token[len(token)-1] == '>' and \
token.lstrip('</').rstrip('>') == cluster_id :
result = calculator.cluster(cluster)
if result['all'] != 0:
wer = float(result['ins'] + result['sub'] + result[
'del']) * 100.0 / result['all']
else:
wer = 0.0
print('%s -> %4.2f %%' % (cluster_id, wer), end=' ')
print('N=%d C=%d S=%d D=%d I=%d' %
(result['all'], result['cor'], result['sub'],
result['del'], result['ins']))
cluster_id = ''
cluster = []
                # beginning of a cluster reached, like <Keyword>
elif token[0] == '<' and token[len(token)-1] == '>' and \
cluster_id == '' :
cluster_id = token.lstrip('<').rstrip('>')
cluster = []
# general terms, like WEATHER / CAR / ...
else:
cluster.append(token)
print()
print(
'==========================================================================='
)
if __name__ == '__main__':
main()
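
A minimal usage sketch for the WER tool above (all file names below are hypothetical): it writes small `utt text` reference/hypothesis files plus a cluster file whose blocks are wrapped in <Name> ... </Name> tags, then invokes the script with the --char, --v and --cluster switches parsed above.

import subprocess

# hypothetical `utt text` reference and hypothesis files
with open("ref.txt", "w", encoding="utf-8") as f:
    f.write("BAC009S0764W0196 明确了发展目标和重点任务\n")
with open("hyp.txt", "w", encoding="utf-8") as f:
    f.write("BAC009S0764W0196 明确了发展目标和重点任务\n")

# hypothetical cluster file: tokens between <Keyword> and </Keyword> are scored as one cluster
with open("clusters.txt", "w", encoding="utf-8") as f:
    f.write("<Keyword>\n天气 导航\n</Keyword>\n")

# assumed script name; reference file first, hypothesis file second (positional arguments)
subprocess.run(
    [
        "python", "compute_wer.py",
        "--char=1", "--v=1", "--cluster=clusters.txt",
        "ref.txt", "hyp.txt",
    ],
    check=True)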

@ -0,0 +1,154 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest with more metadata."""
import argparse
import functools
import json
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def define_argparse():
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.",
nargs='+',
required=True)
    add_arg('output_path', str, None, "filepath of formatted manifest.", required=True)
add_arg('cmvn_path', str,
'examples/librispeech/data/mean_std.json',
"Filepath of cmvn.")
add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
add_arg('vocab_path', str,
'examples/librispeech/data/vocab.txt',
"Filepath of the vocabulary.")
# bpe
add_arg('spm_model_prefix', str, None,
"spm model prefix, spm_model_%(bpe_mode)_%(count_threshold), only need when `unit_type` is spm")
# yapf: disable
args = parser.parse_args()
return args
def format_data(
manifest_paths="",
output_path="",
cmvn_path="examples/librispeech/data/mean_std.json",
unit_type="char",
vocab_path="examples/librispeech/data/vocab.txt",
spm_model_prefix=""):
fout = open(output_path, 'w', encoding='utf-8')
# get feat dim
filetype = cmvn_path.split(".")[-1]
mean, istd = load_cmvn(cmvn_path, filetype=filetype)
feat_dim = mean.shape[0] #(D)
print(f"Feature dim: {feat_dim}")
text_feature = TextFeaturizer(unit_type, vocab_path, spm_model_prefix)
vocab_size = text_feature.vocab_size
print(f"Vocab size: {vocab_size}")
    # each jsonlines record looks like this
# {
# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
# "utt2spk": "111-2222",
# "utt": "111-2222-333"
# }
count = 0
for manifest_path in manifest_paths:
with jsonlines.open(str(manifest_path), 'r') as reader:
manifest_jsons = list(reader)
for line_json in manifest_jsons:
output_json = {
"input": [],
"output": [],
'utt': line_json['utt'],
'utt2spk': line_json.get('utt2spk', 'global'),
}
# output
line = line_json['text']
if isinstance(line, str):
# only one target
tokens = text_feature.tokenize(line)
tokenids = text_feature.featurize(line)
output_json['output'].append({
'name': 'target1',
'shape': (len(tokenids), vocab_size),
'text': line,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
else:
# isinstance(line, list), multi target in one vocab
for i, item in enumerate(line, 1):
tokens = text_feature.tokenize(item)
tokenids = text_feature.featurize(item)
output_json['output'].append({
'name': f'target{i}',
'shape': (len(tokenids), vocab_size),
'text': item,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
# input
line = line_json['feat']
if isinstance(line, str):
# only one input
feat_shape = line_json['feat_shape']
assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
filetype = feat_type(line)
if filetype == 'sound':
feat_shape.append(feat_dim)
else: # kaldi
                    raise NotImplementedError('kaldi feat is not supported yet!')
output_json['input'].append({
"name": "input1",
"shape": feat_shape,
"feat": line,
"filetype": filetype,
})
else:
# isinstance(line, list), multi input
raise NotImplementedError("not support multi input now!")
fout.write(json.dumps(output_json) + '\n')
count += 1
print(f"{manifest_paths} Examples number: {count}")
fout.close()
def main():
args = define_argparse()
print_arguments(args, globals())
format_data(**vars(args))
if __name__ == '__main__':
main()
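
A minimal sketch of driving format_data() directly instead of through the CLI (the data paths are hypothetical and must already exist); the keyword names come from the function signature above, and the defaults mirror the LibriSpeech example paths.

format_data(
    manifest_paths=["data/manifest.train.raw"],  # one or more raw manifests
    output_path="data/manifest.train",           # formatted jsonlines output
    cmvn_path="examples/librispeech/data/mean_std.json",
    unit_type="char",
    vocab_path="examples/librispeech/data/vocab.txt",
    spm_model_prefix="")                         # only used when unit_type == "spm"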

@ -0,0 +1,143 @@
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Format ref/hyp files into `utt text` format to compute CER/WER/MER.
norm:
BAC009S0764W0196 明确了发展目标和重点任务
BAC009S0764W0186 实现我国房地产市场的平稳运行
sclite:
加大对结构机械化环境和收集谈控机制力度(BAC009S0906W0240.wav)
河南省新乡市丰秋县刘光镇政府东五零左右(BAC009S0770W0441.wav)
"""
import argparse
import jsonlines
from paddlespeech.utils.argparse import print_arguments
def transform_hyp(origin, trans, trans_sclite):
"""
Args:
origin: The input json file which contains the model output
        trans: The output file for calculating CER/WER
        trans_sclite: The output file for calculating CER/WER using sclite
"""
input_dict = {}
with open(origin, "r+", encoding="utf8") as f:
for item in jsonlines.Reader(f):
input_dict[item["utt"]] = item["hyps"][0]
if trans:
with open(trans, "w+", encoding="utf8") as f:
for key in input_dict.keys():
f.write(key + " " + input_dict[key] + "\n")
print(f"transform_hyp output: {trans}")
if trans_sclite:
with open(trans_sclite, "w+") as f:
for key in input_dict.keys():
line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
f.write(line)
print(f"transform_hyp output: {trans_sclite}")
def transform_ref(origin, trans, trans_sclite):
"""
Args:
        origin: The input json file which contains the reference text
        trans: The output file for calculating CER/WER
        trans_sclite: The output file for calculating CER/WER using sclite
"""
input_dict = {}
with open(origin, "r", encoding="utf8") as f:
for item in jsonlines.Reader(f):
input_dict[item["utt"]] = item["text"]
if trans:
with open(trans, "w", encoding="utf8") as f:
for key in input_dict.keys():
f.write(key + " " + input_dict[key] + "\n")
print(f"transform_hyp output: {trans}")
if trans_sclite:
with open(trans_sclite, "w") as f:
for key in input_dict.keys():
line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
f.write(line)
print(f"transform_hyp output: {trans_sclite}")
def define_argparse():
parser = argparse.ArgumentParser(
        prog='format ref/hyp file for computing CER/WER', add_help=True)
parser.add_argument(
'--origin_hyp', type=str, default="", help='origin hyp file')
parser.add_argument(
'--trans_hyp',
type=str,
default="",
        help='hyp file for calculating CER/WER')
parser.add_argument(
'--trans_hyp_sclite',
type=str,
default="",
        help='hyp file for calculating CER/WER by sclite')
parser.add_argument(
'--origin_ref', type=str, default="", help='origin ref file')
parser.add_argument(
'--trans_ref',
type=str,
default="",
        help='ref file for calculating CER/WER')
parser.add_argument(
'--trans_ref_sclite',
type=str,
default="",
        help='ref file for calculating CER/WER by sclite')
parser_args = parser.parse_args()
return parser_args
def format_result(origin_hyp="",
trans_hyp="",
trans_hyp_sclite="",
origin_ref="",
trans_ref="",
trans_ref_sclite=""):
if origin_hyp:
transform_hyp(
origin=origin_hyp, trans=trans_hyp, trans_sclite=trans_hyp_sclite)
if origin_ref:
transform_ref(
origin=origin_ref, trans=trans_ref, trans_sclite=trans_ref_sclite)
def main():
args = define_argparse()
print_arguments(args, globals())
format_result(**vars(args))
if __name__ == "__main__":
main()
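
A minimal sketch of the conversion this script performs (paths are hypothetical): given a jsonlines decode result with "utt" and "hyps" fields per line, transform_hyp() emits both the plain `utt text` file and the sclite-style `text(utt.wav)` file described in the module docstring; transform_ref() does the same for the "text" field of a reference manifest.

transform_hyp(
    origin="exp/decode/test.rsl",               # jsonlines: {"utt": ..., "hyps": [...]}
    trans="exp/decode/test.hyp",                # -> "BAC009S0764W0196 明确了..."
    trans_sclite="exp/decode/test.hyp.sclite")  # -> "明确了...(BAC009S0764W0196.wav)"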

@ -267,7 +267,7 @@ def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'to'):
logger.debug("register user to to paddle.Tensor, remove this when fixed!")
logger.debug("register user to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'to', to)
setattr(paddle.static.Variable, 'to', to)

@ -28,8 +28,8 @@ from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.socket_server import AsrRequestHandler
from paddlespeech.s2t.utils.socket_server import AsrTCPServer
from paddlespeech.s2t.utils.socket_server import warm_up_test
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def init_predictor(args):

@ -26,8 +26,8 @@ from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.socket_server import AsrRequestHandler
from paddlespeech.s2t.utils.socket_server import AsrTCPServer
from paddlespeech.s2t.utils.socket_server import warm_up_test
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def start_server(config, args):

@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -27,8 +27,8 @@ from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils import mp_tools
from paddlespeech.s2t.utils.checkpoint import Checkpoint
from paddlespeech.s2t.utils.log import Log
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.s2t.utils.utility import UpdateConfig
from paddlespeech.utils.argparse import print_arguments
logger = Log(__name__).getlog()

@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Trainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
# from paddlespeech.s2t.exps.u2.trainer import U2Trainer as Trainer

@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
model_test_alias = {
"u2": "paddlespeech.s2t.exps.u2.model:U2Tester",

@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
model_train_alias = {
"u2": "paddlespeech.s2t.exps.u2.model:U2Trainer",

@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTrainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTrainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):

@ -45,7 +45,7 @@ class AugmentationPipeline():
samples to make the model invariant to certain types of perturbations in the
real world, improving model's generalization ability.
The pipeline is built according the the augmentation configuration in json
The pipeline is built according to the augmentation configuration in json
string, e.g.
.. code-block::

@ -48,13 +48,16 @@ class TextFeaturizer():
self.unit_type = unit_type
self.unk = UNK
self.maskctc = maskctc
self.vocab_path_or_list = vocab
if vocab:
if self.vocab_path_or_list:
self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id, self.blank_id = self._load_vocabulary_from_file(
vocab, maskctc)
self.vocab_size = len(self.vocab_list)
else:
logger.warning("TextFeaturizer: not have vocab file or vocab list.")
logger.warning(
"TextFeaturizer: not have vocab file or vocab list. Only Tokenizer can use, can not convert to token idx"
)
if unit_type == 'spm':
spm_model = spm_model_prefix + '.model'
@ -62,6 +65,7 @@ class TextFeaturizer():
self.sp.Load(spm_model)
def tokenize(self, text, replace_space=True):
"""tokenizer split text into text tokens"""
if self.unit_type == 'char':
tokens = self.char_tokenize(text, replace_space)
elif self.unit_type == 'word':
@ -71,6 +75,7 @@ class TextFeaturizer():
return tokens
def detokenize(self, tokens):
"""tokenizer convert text tokens back to text"""
if self.unit_type == 'char':
text = self.char_detokenize(tokens)
elif self.unit_type == 'word':
@ -88,6 +93,7 @@ class TextFeaturizer():
Returns:
List[int]: List of token indices.
"""
assert self.vocab_path_or_list, "toidx need vocab path or vocab list"
tokens = self.tokenize(text)
ids = []
for token in tokens:
@ -107,6 +113,7 @@ class TextFeaturizer():
Returns:
str: Text.
"""
assert self.vocab_path_or_list, "toidx need vocab path or vocab list"
tokens = []
for idx in idxs:
if idx == self.eos_id:
@ -127,10 +134,10 @@ class TextFeaturizer():
"""
text = text.strip()
if replace_space:
text_list = [SPACE if item == " " else item for item in list(text)]
tokens = [SPACE if item == " " else item for item in list(text)]
else:
text_list = list(text)
return text_list
tokens = list(text)
return tokens
def char_detokenize(self, tokens):
"""Character detokenizer.

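The TextFeaturizer hunks above make the vocabulary optional: without one the object can still tokenize/detokenize, while featurize()/defeaturize() now assert that a vocab path or list was given. A small sketch of the resulting behavior (keyword names are taken from the hunk; the vocab path is hypothetical):

from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer

# tokenizer-only usage: no vocab, so featurize()/defeaturize() would trip the new assertion
tokenizer = TextFeaturizer(unit_type='char', vocab=None)
print(tokenizer.tokenize("明确了 发展目标"))  # per-character tokens, no vocab needed

# with a vocab file both directions work
featurizer = TextFeaturizer(unit_type='char', vocab='data/lang_char/vocab.txt')
ids = featurizer.featurize("明确了发展目标")
print(featurizer.defeaturize(ids))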
@ -283,7 +283,7 @@ class DynamicBatchSampler(Sampler):
num_quantiles, )
# get quantiles using lognormal distribution
quantiles = lognorm.ppf(latent_boundaries, 1)
# scale up to to max_batch_length
# scale up to max_batch_length
bucket_boundaries = quantiles * max_batch_length / quantiles[-1]
# compute resulting bucket length multipliers
length_multipliers = [

@ -560,7 +560,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
[len(hyp[0]) for hyp in hyps], place=device,
dtype=paddle.long) # (beam_size,)
hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id)
hyps_lens = hyps_lens + 1 # Add <sos> at begining
hyps_lens = hyps_lens + 1 # Add <sos> at beginning
logger.debug(
f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}")
@ -709,7 +709,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
hypothesis from ctc prefix beam search and one encoder output
Args:
hyps (paddle.Tensor): hyps from ctc prefix beam search, already
pad sos at the begining, (B, T)
pad sos at the beginning, (B, T)
hyps_lens (paddle.Tensor): length of each hyp in hyps, (B)
encoder_out (paddle.Tensor): corresponding encoder output, (B=1, T, D)
Returns:

@ -455,7 +455,7 @@ class U2STBaseModel(nn.Layer):
hypothesis from ctc prefix beam search and one encoder output
Args:
hyps (paddle.Tensor): hyps from ctc prefix beam search, already
pad sos at the begining, (B, T)
pad sos at the beginning, (B, T)
hyps_lens (paddle.Tensor): length of each hyp in hyps, (B)
encoder_out (paddle.Tensor): corresponding encoder output, (B=1, T, D)
Returns:

