diff --git a/.gitignore b/.gitignore index 75f56b604..4a0c43312 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ *.egg-info build *output/ +.history audio/dist/ audio/fc_patch/ diff --git a/README.md b/README.md index dbdf6a4f8..afc4e4d09 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,8 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update +- 🔥 2023.01.10: Add [code-switch asr CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). +- 👑 2023.01.06: Add [code-switch asr tal_cs recipe](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). - 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model). - 🎉 2022.11.30: Add [TTS Android Demo](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid). - 🤗 2022.11.28: PP-TTS and PP-ASR demos are available in [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) and [official website @@ -189,7 +191,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - Scan the QR code below with your Wechat, you can access to official technical exchange group and get the bonus ( more than 20GB learning materials, such as papers, codes and videos ) and the live link of the lessons. Look forward to your participation.
- +
## Installation diff --git a/README_cn.md b/README_cn.md index 5cc156c9f..ecc4644aa 100644 --- a/README_cn.md +++ b/README_cn.md @@ -164,6 +164,8 @@ ### 近期更新 +- 🔥 2023.01.10: 新增 [中英混合 ASR CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). +- 👑 2023.01.06: 新增 [ASR中英混合 tal_cs 训练推理流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). - 🎉 2022.12.02: 新增 [端到端韵律预测全流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。 - 🎉 2022.11.30: 新增 [TTS Android 部署示例](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid)。 - 🤗 2022.11.28: PP-TTS and PP-ASR 示例可在 [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) 和[飞桨官网](https://www.paddlepaddle.org.cn/models)体验! @@ -200,7 +202,7 @@ 微信扫描二维码关注公众号,点击“马上报名”填写问卷加入官方交流群,获得更高效的问题答疑,与各行各业开发者充分交流,期待您的加入。
- +
diff --git a/audio/setup.py b/audio/setup.py index 82e9a55a5..d36b2c440 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -43,7 +43,7 @@ base = [ "scipy>=1.0.0", "soundfile~=0.10", "colorlog", - "pathos == 0.2.8", + "pathos==0.2.8", "pybind11", "parameterized", "tqdm", diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index c815a88af..ee2acd6fd 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -17,7 +17,7 @@ The input of this demo should be a WAV file(`.wav`), and the sample rate must be Here are sample files for this demo that can be downloaded: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` ### 3. Usage - Command Line(Recommended) ```bash # Chinese paddlespeech asr --input ./zh.wav -v # English paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v + # Code-Switch + paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav -v # Chinese ASR + Punctuation Restoration paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v ``` @@ -40,6 +42,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `input`(required): Audio file to recognize. - `model`: Model type of asr task. Default: `conformer_wenetspeech`. - `lang`: Model language. Default: `zh`. + - `codeswitch`: Code Switch Model. Default: `False`. - `sample_rate`: Sample rate of the model. Default: `16000`. - `config`: Config of asr task. Use pretrained model when it is None. Default: `None`. - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`. 
@@ -83,14 +86,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee Here is a list of pretrained models released by PaddleSpeech that can be used by command and python API: -| Model | Language | Sample Rate -| :--- | :---: | :---: | -| conformer_wenetspeech | zh | 16k -| conformer_online_multicn | zh | 16k -| conformer_aishell | zh | 16k -| conformer_online_aishell | zh | 16k -| transformer_librispeech | en | 16k -| deepspeech2online_wenetspeech | zh | 16k -| deepspeech2offline_aishell| zh| 16k -| deepspeech2online_aishell | zh | 16k -| deepspeech2offline_librispeech | en | 16k +| Model | Code Switch | Language | Sample Rate +| :--- | :---: | :---: | :---: | +| conformer_wenetspeech | False | zh | 16k +| conformer_online_multicn | False | zh | 16k +| conformer_aishell | False | zh | 16k +| conformer_online_aishell | False | zh | 16k +| transformer_librispeech | False | en | 16k +| deepspeech2online_wenetspeech | False | zh | 16k +| deepspeech2offline_aishell | False | zh| 16k +| deepspeech2online_aishell | False | zh | 16k +| deepspeech2offline_librispeech | False | en | 16k +| conformer_talcs | True | zh_en | 16k diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index 13aa9f277..62dce3bc9 100644 --- a/demos/speech_recognition/README_cn.md +++ b/demos/speech_recognition/README_cn.md @@ -1,4 +1,5 @@ (简体中文|[English](./README.md)) + (简体中文|[English](./README.md)) # 语音识别 ## 介绍 @@ -16,7 +17,7 @@ 可以下载此 demo 的示例音频: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` ### 3. 
使用方法 - 命令行 (推荐使用) @@ -25,6 +26,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee paddlespeech asr --input ./zh.wav -v # 英文 paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v + #中英混合 + paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav -v # 中文 + 标点恢复 paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v ``` @@ -38,6 +41,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `input`(必须输入):用于识别的音频文件。 - `model`:ASR 任务的模型,默认值:`conformer_wenetspeech`。 - `lang`:模型语言,默认值:`zh`。 + - `codeswitch`: 是否使用语言转换,默认值:`False`。 - `sample_rate`:音频采样率,默认值:`16000`。 - `config`:ASR 任务的参数文件,若不设置则使用预训练模型中的默认配置,默认值:`None`。 - `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`。 @@ -80,14 +84,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### 4.预训练模型 以下是 PaddleSpeech 提供的可以被命令行和 python API 使用的预训练模型列表: -| 模型 | 语言 | 采样率 -| :--- | :---: | :---: | -| conformer_wenetspeech | zh | 16k -| conformer_online_multicn | zh | 16k -| conformer_aishell | zh | 16k -| conformer_online_aishell | zh | 16k -| transformer_librispeech | en | 16k -| deepspeech2online_wenetspeech | zh | 16k -| deepspeech2offline_aishell| zh| 16k -| deepspeech2online_aishell | zh | 16k -| deepspeech2offline_librispeech | en | 16k +| 模型 | 语言转换 | 语言 | 采样率 +| :--- | :---: | :---: | :---: | +| conformer_wenetspeech | False | zh | 16k +| conformer_online_multicn | False | zh | 16k +| conformer_aishell | False | zh | 16k +| conformer_online_aishell | False | zh | 16k +| transformer_librispeech | False | en | 16k +| deepspeech2online_wenetspeech | False | zh | 16k +| deepspeech2offline_aishell | False | zh| 16k +| deepspeech2online_aishell | False | zh | 16k +| deepspeech2offline_librispeech | False | en | 16k +| conformer_talcs | True | zh_en | 16k diff --git a/demos/speech_recognition/run.sh b/demos/speech_recognition/run.sh index e48ff3e96..8ba6e4c3e 
100755 --- a/demos/speech_recognition/run.sh +++ b/demos/speech_recognition/run.sh @@ -2,6 +2,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav # asr paddlespeech asr --input ./zh.wav @@ -18,6 +19,11 @@ paddlespeech asr --help # english asr paddlespeech asr --lang en --model transformer_librispeech --input ./en.wav + +# code-switch asr +paddlespeech asr --lang zh_en --codeswitch True --model conformer_talcs --input ./ch_zh_mix.wav + + # model stats paddlespeech stats --task asr diff --git a/docs/requirements.txt b/docs/requirements.txt index c6228d917..5422c26f9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,4 @@ braceexpand -colorlog editdistance fastapi g2p_en @@ -16,7 +15,7 @@ matplotlib myst-parser nara_wpe numpydoc -onnxruntime==1.10.0 +onnxruntime>=1.11.0 opencc paddlenlp # use paddlepaddle == 2.3.* according to: https://github.com/PaddlePaddle/Paddle/issues/48243 @@ -24,7 +23,6 @@ paddlepaddle>=2.2.2,<2.4.0 paddlespeech_ctcdecoders paddlespeech_feat pandas -pathos==0.2.8 pattern_singleton Pillow>=9.0.0 ppdiffusers>=0.9.0 diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/tts3/path.sh @@ -0,0 +1 @@ 
+../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh deleted file mode 100755 index a37cd21e3..000000000 --- a/examples/aishell3/vc0/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=tacotron2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh new file mode 120000 index 000000000..9e1fdbd16 --- /dev/null +++ b/examples/aishell3/vc0/path.sh @@ -0,0 +1 @@ +../../csmsc/tts0/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc1/local/train.sh b/examples/aishell3/vc1/local/train.sh deleted file mode 100755 index c775fcadc..000000000 --- a/examples/aishell3/vc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --voice-cloning=True \ No newline at end of file diff --git a/examples/aishell3/vc1/local/train.sh b/examples/aishell3/vc1/local/train.sh new file mode 120000 index 000000000..115a0b8dc --- /dev/null +++ b/examples/aishell3/vc1/local/train.sh @@ -0,0 +1 @@ +../../vc0/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/vc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export 
PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/vc1/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh deleted file mode 100755 index 8fd8977d3..000000000 --- a/examples/aishell3/vc2/local/synthesize.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -python3 ${BIN_DIR}/../synthesize.py \ - --am=fastspeech2_aishell3 \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_aishell3 \ - --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \ - --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ - --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt \ - --speaker_dict=dump/speaker_id_map.txt \ - --voice-cloning=True diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh new file mode 120000 index 000000000..ca8df6b04 --- /dev/null +++ b/examples/aishell3/vc2/local/synthesize.sh @@ -0,0 +1 @@ +../../vc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/local/train.sh b/examples/aishell3/vc2/local/train.sh deleted file mode 100755 index c775fcadc..000000000 --- a/examples/aishell3/vc2/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 
-train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --voice-cloning=True \ No newline at end of file diff --git a/examples/aishell3/vc2/local/train.sh b/examples/aishell3/vc2/local/train.sh new file mode 120000 index 000000000..115a0b8dc --- /dev/null +++ b/examples/aishell3/vc2/local/train.sh @@ -0,0 +1 @@ +../../vc0/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/path.sh b/examples/aishell3/vc2/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/vc2/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc2/path.sh b/examples/aishell3/vc2/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/vc2/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/local/preprocess.sh b/examples/aishell3/voc1/local/preprocess.sh index 44cc3dbe4..71eab68ad 100755 --- a/examples/aishell3/voc1/local/preprocess.sh +++ b/examples/aishell3/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl 
\ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/aishell3/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/aishell3/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/aishell3/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ 
b/examples/aishell3/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/aishell3/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/aishell3/voc1/path.sh @@ -0,0 +1 @@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh deleted file mode 100755 index 44cc3dbe4..000000000 --- a/examples/aishell3/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./aishell3_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." 
- python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/data_aishell3/ \ - --dataset=aishell3 \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/aishell3/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/aishell3/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git 
a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/aishell3/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/aishell3/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/aishell3/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/aishell3/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/aishell3/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git 
a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh deleted file mode 100755 index 8b4178f13..000000000 --- a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -stage=0 -stop_stage=0 - -# hifigan -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - FLAGS_allocator_strategy=naive_best_fit \ - FLAGS_fraction_of_gpu_memory_to_use=0.01 \ - python3 ${BIN_DIR}/synthesize.py \ - --erniesat_config=${config_path} \ - --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --erniesat_stat=dump/train/speech_stats.npy \ - --voc=hifigan_aishell3 \ - --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \ - --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \ - --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt -fi diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh new file mode 120000 index 000000000..5703dcb2c --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh deleted file mode 100755 index 526aac435..000000000 --- a/examples/aishell3_vctk/ernie_sat/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=8 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git 
a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh new file mode 120000 index 000000000..9f1d2346d --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/train.sh \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/path.sh b/examples/aishell3_vctk/ernie_sat/path.sh deleted file mode 100755 index 4ecab0251..000000000 --- a/examples/aishell3_vctk/ernie_sat/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=ernie_sat -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/path.sh b/examples/aishell3_vctk/ernie_sat/path.sh new file mode 120000 index 000000000..5ec397590 --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/path.sh @@ -0,0 +1 @@ +../../aishell3/ernie_sat/path.sh \ No newline at end of file diff --git a/examples/csmsc/tts3/local/PTQ_static.sh b/examples/csmsc/tts3/local/PTQ_static.sh index a70a77b58..c6dce53cb 100755 --- a/examples/csmsc/tts3/local/PTQ_static.sh +++ b/examples/csmsc/tts3/local/PTQ_static.sh @@ -5,4 +5,4 @@ python3 ${BIN_DIR}/../PTQ_static.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --inference_dir ${train_output_path}/inference \ --model_name ${model_name} \ - --onnx_forma=True \ No newline at end of file + --onnx_format=True \ No newline at end of file diff --git a/examples/csmsc/voc1/local/PTQ_static.sh b/examples/csmsc/voc1/local/PTQ_static.sh index 2e5166141..c85ebd109 100755 --- a/examples/csmsc/voc1/local/PTQ_static.sh +++ b/examples/csmsc/voc1/local/PTQ_static.sh @@ -2,7 +2,7 @@ train_output_path=$1 model_name=$2 
python3 ${BIN_DIR}/../../PTQ_static.py \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ + --dev-metadata=dump/dev/raw/metadata.jsonl \ --inference_dir ${train_output_path}/inference \ --model_name ${model_name} \ --onnx_format=True \ No newline at end of file diff --git a/examples/csmsc/voc1/local/preprocess.sh b/examples/csmsc/voc1/local/preprocess.sh index 61d6d62be..62d0717b9 100755 --- a/examples/csmsc/voc1/local/preprocess.sh +++ b/examples/csmsc/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh deleted file mode 100755 index 6719bd0be..000000000 --- a/examples/csmsc/voc3/finetune.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -source path.sh - -gpus=0 -stage=0 -stop_stage=100 - -source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \ - --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ - --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ - --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ - --dur-file=durations.txt \ - --output-dir=dump_finetune \ - --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \ - --dataset=baker \ - 
--rootdir=~/datasets/BZNSYP/ -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - python3 ${MAIN_ROOT}/utils/link_wav.py \ - --old-dump-dir=dump \ - --dump-dir=dump_finetune -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - cp dump/train/feats_stats.npy dump_finetune/train/ -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/train/raw/metadata.jsonl \ - --dumpdir=dump_finetune/train/norm \ - --stats=dump_finetune/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/dev/raw/metadata.jsonl \ - --dumpdir=dump_finetune/dev/norm \ - --stats=dump_finetune/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/test/raw/metadata.jsonl \ - --dumpdir=dump_finetune/test/norm \ - --stats=dump_finetune/train/feats_stats.npy -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - CUDA_VISIBLE_DEVICES=${gpus} \ - FLAGS_cudnn_exhaustive_search=true \ - FLAGS_conv_workspace_size_limit=4000 \ - python ${BIN_DIR}/train.py \ - --train-metadata=dump_finetune/train/norm/metadata.jsonl \ - --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \ - --config=conf/finetune.yaml \ - --output-dir=exp/finetune \ - --ngpu=1 -fi \ No newline at end of file diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh new file mode 120000 index 000000000..b6fa868e2 --- /dev/null +++ b/examples/csmsc/voc3/finetune.sh @@ -0,0 +1 @@ +../voc5/finetune.sh \ No newline at end of file diff --git a/examples/csmsc/voc3/local/preprocess.sh b/examples/csmsc/voc3/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc3/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if 
[ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc3/local/preprocess.sh b/examples/csmsc/voc3/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc3/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - 
-FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc3/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc4/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc4/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc4/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc4/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc5/finetune.sh b/examples/csmsc/voc5/finetune.sh index 6719bd0be..eb8325aeb 100755 --- a/examples/csmsc/voc5/finetune.sh +++ b/examples/csmsc/voc5/finetune.sh @@ -39,16 +39,19 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/train/raw/metadata.jsonl \ --dumpdir=dump_finetune/train/norm \ - --stats=dump_finetune/train/feats_stats.npy + 
--stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/dev/raw/metadata.jsonl \ --dumpdir=dump_finetune/dev/norm \ - --stats=dump_finetune/train/feats_stats.npy + --stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/test/raw/metadata.jsonl \ --dumpdir=dump_finetune/test/norm \ - --stats=dump_finetune/train/feats_stats.npy + --stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/csmsc/voc5/local/preprocess.sh b/examples/csmsc/voc5/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc5/local/preprocess.sh b/examples/csmsc/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc5/local/train.sh b/examples/csmsc/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc5/local/train.sh b/examples/csmsc/voc5/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc5/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc6/local/preprocess.sh b/examples/csmsc/voc6/local/preprocess.sh index 2dcc39ac7..509824b8e 100755 --- a/examples/csmsc/voc6/local/preprocess.sh +++ b/examples/csmsc/voc6/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + 
--stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc6/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc6/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh deleted file mode 100755 index f90db9150..000000000 --- a/examples/ljspeech/tts0/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh new file mode 120000 index 000000000..7f54e9239 --- /dev/null +++ 
b/examples/ljspeech/tts0/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/tts0/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh deleted file mode 100755 index a37cd21e3..000000000 --- a/examples/ljspeech/tts0/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=tacotron2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh new file mode 120000 index 000000000..9e1fdbd16 --- /dev/null +++ b/examples/ljspeech/tts0/path.sh @@ -0,0 +1 @@ +../../csmsc/tts0/path.sh \ No newline at end of file diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh deleted file mode 100755 index d1302f99f..000000000 --- a/examples/ljspeech/tts3/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh new file mode 120000 index 000000000..d7b05058e --- /dev/null +++ b/examples/ljspeech/tts3/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/tts3/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/ljspeech/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` 
- -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/ljspeech/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/local/preprocess.sh b/examples/ljspeech/voc1/local/preprocess.sh index d1af60dad..bfbf75b7d 100755 --- a/examples/ljspeech/voc1/local/preprocess.sh +++ b/examples/ljspeech/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/ljspeech/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - 
--checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/ljspeech/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/ljspeech/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/ljspeech/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/path.sh b/examples/ljspeech/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/ljspeech/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/ljspeech/voc1/path.sh 
b/examples/ljspeech/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/ljspeech/voc1/path.sh @@ -0,0 +1 @@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh deleted file mode 100755 index d1af60dad..000000000 --- a/examples/ljspeech/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./ljspeech_alignment \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/LJSpeech-1.1/ \ - --dataset=ljspeech \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/ljspeech/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/ljspeech/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/ljspeech/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/ljspeech/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ 
-FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/ljspeech/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/ljspeech/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/ljspeech/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/vctk/ernie_sat/local/train.sh b/examples/vctk/ernie_sat/local/train.sh deleted file mode 100755 index 526aac435..000000000 --- a/examples/vctk/ernie_sat/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=8 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/vctk/ernie_sat/local/train.sh 
b/examples/vctk/ernie_sat/local/train.sh new file mode 120000 index 000000000..9f1d2346d --- /dev/null +++ b/examples/vctk/ernie_sat/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/train.sh \ No newline at end of file diff --git a/examples/vctk/ernie_sat/path.sh b/examples/vctk/ernie_sat/path.sh deleted file mode 100755 index 4ecab0251..000000000 --- a/examples/vctk/ernie_sat/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=ernie_sat -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} \ No newline at end of file diff --git a/examples/vctk/ernie_sat/path.sh b/examples/vctk/ernie_sat/path.sh new file mode 120000 index 000000000..5ec397590 --- /dev/null +++ b/examples/vctk/ernie_sat/path.sh @@ -0,0 +1 @@ +../../aishell3/ernie_sat/path.sh \ No newline at end of file diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh deleted file mode 100755 index 3a5076505..000000000 --- a/examples/vctk/tts3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh new file mode 120000 index 000000000..78885a300 --- /dev/null +++ b/examples/vctk/tts3/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/tts3/local/train.sh \ No newline at end of file diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh deleted 
file mode 100755 index fb7e8411c..000000000 --- a/examples/vctk/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/vctk/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/vctk/voc1/local/preprocess.sh b/examples/vctk/voc1/local/preprocess.sh index 88a478cd5..6b7e5288a 100755 --- a/examples/vctk/voc1/local/preprocess.sh +++ b/examples/vctk/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/vctk/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ 
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/vctk/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/vctk/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/vctk/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/vctk/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of 
file diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/vctk/voc1/path.sh @@ -0,0 +1 @@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh deleted file mode 100755 index 88a478cd5..000000000 --- a/examples/vctk/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./vctk_alignment \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/VCTK-Corpus-0.92/ \ - --dataset=vctk \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/vctk/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/vctk/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/vctk/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/vctk/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python 
${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/vctk/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/vctk/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/vctk/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/zh_en_tts/tts3/local/train.sh b/examples/zh_en_tts/tts3/local/train.sh deleted file mode 100755 index 1da72f117..000000000 --- a/examples/zh_en_tts/tts3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/zh_en_tts/tts3/local/train.sh b/examples/zh_en_tts/tts3/local/train.sh new file mode 120000 index 000000000..78885a300 --- /dev/null 
+++ b/examples/zh_en_tts/tts3/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/tts3/local/train.sh \ No newline at end of file diff --git a/examples/zh_en_tts/tts3/path.sh b/examples/zh_en_tts/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/zh_en_tts/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/zh_en_tts/tts3/path.sh b/examples/zh_en_tts/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/zh_en_tts/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 004143361..7a7aef8b0 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -25,6 +25,9 @@ import librosa import numpy as np import paddle import soundfile +from paddlespeech.audio.transform.transformation import Transformation +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.utils.utility import UpdateConfig from yacs.config import CfgNode from ...utils.env import MODEL_HOME @@ -34,9 +37,6 @@ from ..log import logger from ..utils import CLI_TIMER from ..utils import stats_wrapper from ..utils import timer_register -from paddlespeech.audio.transform.transformation import Transformation -from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ['ASRExecutor'] @@ -62,8 +62,13 @@ class ASRExecutor(BaseExecutor): '--lang', type=str, default='zh', - help='Choose model language. 
zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]' + help='Choose model language. [zh, en, zh_en], zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k], zh_en:[conformer_talcs-codeswitch_zh_en-16k]' ) + self.parser.add_argument( + '--codeswitch', + type=bool, + default=False, + help='Choose whether use code-switch. True or False.') self.parser.add_argument( "--sample_rate", type=int, @@ -127,6 +132,7 @@ class ASRExecutor(BaseExecutor): def _init_from_path(self, model_type: str='wenetspeech', lang: str='zh', + codeswitch: bool=False, sample_rate: int=16000, cfg_path: Optional[os.PathLike]=None, decode_method: str='attention_rescoring', @@ -144,7 +150,12 @@ class ASRExecutor(BaseExecutor): if cfg_path is None or ckpt_path is None: sample_rate_str = '16k' if sample_rate == 16000 else '8k' - tag = model_type + '-' + lang + '-' + sample_rate_str + if lang == "zh_en" and codeswitch is True: + tag = model_type + '-' + 'codeswitch_' + lang + '-' + sample_rate_str + elif lang == "zh_en" or codeswitch is True: + raise Exception("codeswitch is true only in zh_en model") + else: + tag = model_type + '-' + lang + '-' + sample_rate_str self.task_resource.set_task_model(tag, version=None) self.res_path = self.task_resource.res_dir @@ -423,6 +434,7 @@ class ASRExecutor(BaseExecutor): model = parser_args.model lang = parser_args.lang + codeswitch = parser_args.codeswitch sample_rate = parser_args.sample_rate config = parser_args.config ckpt_path = parser_args.ckpt_path @@ -444,6 +456,7 @@ class ASRExecutor(BaseExecutor): audio_file=input_, model=model, lang=lang, + codeswitch=codeswitch, sample_rate=sample_rate, config=config, ckpt_path=ckpt_path, @@ -472,6 +485,7 @@ class ASRExecutor(BaseExecutor): audio_file: os.PathLike, model: str='conformer_u2pp_online_wenetspeech', lang: str='zh', + codeswitch: bool=False, sample_rate: int=16000, config: os.PathLike=None, ckpt_path: os.PathLike=None, @@ -485,8 +499,8 @@ class 
ASRExecutor(BaseExecutor): """ audio_file = os.path.abspath(audio_file) paddle.set_device(device) - self._init_from_path(model, lang, sample_rate, config, decode_method, - num_decoding_left_chunks, ckpt_path) + self._init_from_path(model, lang, codeswitch, sample_rate, config, + decode_method, num_decoding_left_chunks, ckpt_path) if not self._check(audio_file, sample_rate, force_yes): sys.exit(-1) if rtf: diff --git a/paddlespeech/cli/base_commands.py b/paddlespeech/cli/base_commands.py index 767d0df78..dfeb5cae5 100644 --- a/paddlespeech/cli/base_commands.py +++ b/paddlespeech/cli/base_commands.py @@ -14,6 +14,7 @@ import argparse from typing import List +import numpy from prettytable import PrettyTable from ..resource import CommonTaskResource @@ -78,7 +79,7 @@ class VersionCommand: model_name_format = { - 'asr': 'Model-Language-Sample Rate', + 'asr': 'Model-Size-Code Switch-Multilingual-Language-Sample Rate', 'cls': 'Model-Sample Rate', 'st': 'Model-Source language-Target language', 'text': 'Model-Task-Language', @@ -111,7 +112,21 @@ class StatsCommand: fields = model_name_format[self.task].split("-") table = PrettyTable(fields) for key in pretrained_models: - table.add_row(key.split("-")) + line = key.split("-") + if self.task == "asr" and len(line) < len(fields): + for i in range(len(line), len(fields)): + line.append("-") + if "codeswitch" in key: + line[3], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + elif "multilingual" in key: + line[4], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + tmp = numpy.array(line) + idx = [0, 5, 3, 4, 1, 2] + line = tmp[idx] + table.add_row(line) + print(table) def execute(self, argv: List[str]) -> bool: diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 3c5aa1f90..ff0b30f6d 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -30,6 +30,7 @@ __all__ = [ ] # The tags for pretrained_models 
should be "{model_name}[_{dataset}][-{lang}][-...]". +# Add code-switch and multilingual tag, "{model_name}[_{dataset}]-[codeswitch/multilingual][_{lang}][-...]". # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k". # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" @@ -322,6 +323,18 @@ asr_dynamic_pretrained_models = { '099a601759d467cd0a8523ff939819c5' }, }, + "conformer_talcs-codeswitch_zh_en-16k": { + '1.4': { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/tal_cs/asr1/asr1_conformer_talcs_ckpt_1.4.0.model.tar.gz', + 'md5': + '01962c5d0a70878fe41cacd4f61e14d1', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/conformer/checkpoints/avg_10' + }, + }, } asr_static_pretrained_models = { diff --git a/paddlespeech/s2t/models/whisper/tokenizer.py b/paddlespeech/s2t/models/whisper/tokenizer.py index 8bd85c914..1e1aea044 100644 --- a/paddlespeech/s2t/models/whisper/tokenizer.py +++ b/paddlespeech/s2t/models/whisper/tokenizer.py @@ -155,6 +155,10 @@ class Tokenizer: if ids < len(self.tokenizer): ids_list.append(ids) token_ids = ids_list + elif len(token_ids) == 1: + token_ids = token_ids[0] + else: + raise ValueError(f"token_ids {token_ids} load error.") return self.tokenizer.decode(token_ids, **kwargs) diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py index 63cafbdb7..9cf9a9eca 100644 --- a/paddlespeech/s2t/models/whisper/whipser.py +++ b/paddlespeech/s2t/models/whisper/whipser.py @@ -17,12 +17,11 @@ from typing import Union import numpy as np import paddle import paddle.nn.functional as F +import paddlespeech.s2t.modules.align as paddlespeech_nn import soundfile import tqdm from paddle import nn from paddle.distribution import Categorical - -import paddlespeech.s2t.modules.align as paddlespeech_nn from paddlespeech.s2t.models.whisper import utils from 
paddlespeech.s2t.models.whisper.tokenizer import get_tokenizer from paddlespeech.s2t.models.whisper.tokenizer import LANGUAGES @@ -477,7 +476,7 @@ def transcribe( decode_options["fp16"] = False if decode_options.get( - "language", 'None') or decode_options.get("language", None) is None: + "language") == 'None' or decode_options.get("language", None) is None: if not model.is_multilingual: decode_options["language"] = "en" else: @@ -771,8 +770,10 @@ class GreedyDecoder(TokenDecoder): if temperature == 0: next_tokens = paddle.argmax(logits, axis=-1) else: - next_tokens = Categorical(logits=logits / temperature).sample( - shape=logits.shape) + next_tokens = Categorical(logits=logits / temperature).sample([1]) + next_tokens = paddle.reshape(next_tokens, [ + next_tokens.shape[0] * next_tokens.shape[1], + ]) logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32) current_logprobs = logprobs[paddle.arange(logprobs.shape[0]), @@ -1205,9 +1206,8 @@ class DecodingTask: DecodingResult( audio_features=features, language=language, - language_probs=probs) - for features, language, probs in zip(audio_features, languages, - language_probs) + language_probs=probs) for features, language, probs in + zip(audio_features, languages, language_probs) ] # repeat the audio & text tensors by the group size, for beam search or best-of-n sampling diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py index 26ac501e2..be6fcf589 100644 --- a/paddlespeech/s2t/training/gradclip.py +++ b/paddlespeech/s2t/training/gradclip.py @@ -43,8 +43,8 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): if g.type == core.VarDesc.VarType.SELECTED_ROWS: merge_grad = layers.merge_selected_rows(g) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - square = layers.square(merge_grad) - sum_square = layers.reduce_sum(square) + square = paddle.square(merge_grad) + sum_square = paddle.sum(square) sum_square_list.append(sum_square) # debug log, 
not dump all since slow down train process @@ -57,23 +57,24 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): return params_grads global_norm_var = layers.concat(sum_square_list) - global_norm_var = layers.reduce_sum(global_norm_var) - global_norm_var = layers.sqrt(global_norm_var) + global_norm_var = paddle.sum(global_norm_var) + global_norm_var = paddle.sqrt(global_norm_var) + # debug log logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!") max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) - clip_var = layers.elementwise_div( + clip_var = paddle.divide( x=max_global_norm, - y=layers.elementwise_max(x=global_norm_var, y=max_global_norm)) + y=paddle.maximum(x=global_norm_var, y=max_global_norm)) for i, (p, g) in enumerate(params_grads): if g is None: continue if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue - new_grad = layers.elementwise_mul(x=g, y=clip_var) + new_grad = paddle.multiply(x=g, y=clip_var) params_and_grads.append((p, new_grad)) # debug log, not dump all since slow down train process diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index 1b1792bd1..299a8c3d4 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -16,14 +16,9 @@ import sys import warnings from typing import List +import numpy import uvicorn from fastapi import FastAPI -from prettytable import PrettyTable -from starlette.middleware.cors import CORSMiddleware - -from ..executor import BaseExecutor -from ..util import cli_server_register -from ..util import stats_wrapper from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource from paddlespeech.server.engine.engine_pool import init_engine_pool @@ -31,6 +26,12 @@ from paddlespeech.server.engine.engine_warmup import warm_up from paddlespeech.server.restful.api import 
setup_router as setup_http_router from paddlespeech.server.utils.config import get_config from paddlespeech.server.ws.api import setup_router as setup_ws_router +from prettytable import PrettyTable +from starlette.middleware.cors import CORSMiddleware + +from ..executor import BaseExecutor +from ..util import cli_server_register +from ..util import stats_wrapper warnings.filterwarnings("ignore") __all__ = ['ServerExecutor', 'ServerStatsExecutor'] @@ -134,7 +135,7 @@ class ServerStatsExecutor(): required=True) self.task_choices = ['asr', 'tts', 'cls', 'text', 'vector'] self.model_name_format = { - 'asr': 'Model-Language-Sample Rate', + 'asr': 'Model-Size-Code Switch-Multilingual-Language-Sample Rate', 'tts': 'Model-Language', 'cls': 'Model-Sample Rate', 'text': 'Model-Task-Language', @@ -145,7 +146,20 @@ class ServerStatsExecutor(): fields = self.model_name_format[self.task].split("-") table = PrettyTable(fields) for key in pretrained_models: - table.add_row(key.split("-")) + line = key.split("-") + if self.task == "asr" and len(line) < len(fields): + for i in range(len(line), len(fields)): + line.append("-") + if "codeswitch" in key: + line[3], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + elif "multilingual" in key: + line[4], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + tmp = numpy.array(line) + idx = [0, 5, 3, 4, 1, 2] + line = tmp[idx] + table.add_row(line) print(table) def execute(self, argv: List[str]) -> bool: diff --git a/paddlespeech/t2s/exps/vits/normalize.py b/paddlespeech/t2s/exps/vits/normalize.py index 514cbef8e..24e15765e 100644 --- a/paddlespeech/t2s/exps/vits/normalize.py +++ b/paddlespeech/t2s/exps/vits/normalize.py @@ -187,7 +187,7 @@ def main(): record["spk_emb"] = str(item["spk_emb"]) output_metadata.append(record) - output_metadata.sort(key=itemgetter('feats_lengths')) + output_metadata.sort(key=itemgetter('feats_lengths'), reverse=True) output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" with 
jsonlines.open(output_metadata_path, 'w') as writer: for item in output_metadata: diff --git a/paddlespeech/t2s/exps/vits/preprocess.py b/paddlespeech/t2s/exps/vits/preprocess.py index 2b1a40834..d6b226a20 100644 --- a/paddlespeech/t2s/exps/vits/preprocess.py +++ b/paddlespeech/t2s/exps/vits/preprocess.py @@ -166,7 +166,7 @@ def process_sentences(config, if record: results.append(record) - results.sort(key=itemgetter("feats_lengths")) + results.sort(key=itemgetter("feats_lengths"), reverse=True) with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer: for item in results: writer.write(item) diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index 07301db56..f6a31ced2 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -110,7 +110,7 @@ def train_sp(args, config): train_sampler = ErnieSATSampler( train_dataset, batch_size=config.batch_size, - shuffle=True, + shuffle=False, drop_last=True) dev_sampler = ErnieSATSampler( dev_dataset, diff --git a/paddlespeech/t2s/frontend/g2pw/onnx_api.py b/paddlespeech/t2s/frontend/g2pw/onnx_api.py index 47c26a610..3ce3d246d 100644 --- a/paddlespeech/t2s/frontend/g2pw/onnx_api.py +++ b/paddlespeech/t2s/frontend/g2pw/onnx_api.py @@ -100,7 +100,7 @@ class G2PWOnnxConverter: ] self.non_polyphonic = { '一', '不', '和', '咋', '嗲', '剖', '差', '攢', '倒', '難', '奔', '勁', '拗', - '肖', '瘙', '誒', '泊', '听' + '肖', '瘙', '誒', '泊', '听', '噢' } self.non_monophonic = {'似', '攢'} self.monophonic_chars = [ diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py index 19c98d53f..c13a5ab62 100644 --- a/paddlespeech/t2s/frontend/mix_frontend.py +++ b/paddlespeech/t2s/frontend/mix_frontend.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import re from typing import Dict from typing import List @@ -18,6 +19,7 @@ import paddle from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor class MixFrontend(): @@ -107,7 +109,40 @@ class MixFrontend(): add_sp: bool=True, to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: - segments = self.get_segment(sentence) + ''' 1. 添加SSML支持,先列出 文字 和 标签内容, + 然后添加到tmpSegments数组里 + ''' + d_inputs = MixTextProcessor.get_dom_split(sentence) + tmpSegments = [] + for instr in d_inputs: + ''' 暂时只支持 say-as ''' + if instr.lower().startswith("" + segments.append(tuple(currentSeg)) + segments.append(seg) + currentSeg = ["", ""] + else: + if currentSeg[0] == '': + currentSeg[0] = seg[0] + currentSeg[1] = seg[1] + else: + currentSeg[0] = currentSeg[0] + seg[0] + if currentSeg[0] != '': + currentSeg[0] = "" + currentSeg[0] + "" + segments.append(tuple(currentSeg)) phones_list = [] result = {} @@ -120,11 +155,21 @@ class MixFrontend(): input_ids = self.en_frontend.get_input_ids( content, merge_sentences=False, to_tensor=to_tensor) else: - input_ids = self.zh_frontend.get_input_ids( - content, - merge_sentences=False, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) + ''' 3. 
把带speak tag的中文和普通文字分开处理 + ''' + if content.strip() != "" and \ + re.match(r".*?.*?.*", content, re.DOTALL): + input_ids = self.zh_frontend.get_input_ids_ssml( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) + else: + input_ids = self.zh_frontend.get_input_ids( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) if add_sp: input_ids["phone_ids"][-1] = paddle.concat( [input_ids["phone_ids"][-1], self.sp_id_tensor]) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index ddd8cf5c7..efb673e36 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -138,7 +138,7 @@ class Frontend(): "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿", "流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿", "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿", - "狗儿" + "狗儿", "少儿" } self.vocab_phones = {} diff --git a/paddlespeech/t2s/models/vits/text_encoder.py b/paddlespeech/t2s/models/vits/text_encoder.py index 799e0c759..015ed76c6 100644 --- a/paddlespeech/t2s/models/vits/text_encoder.py +++ b/paddlespeech/t2s/models/vits/text_encoder.py @@ -24,6 +24,7 @@ from paddle import nn from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder as Encoder +from paddlespeech.utils.initialize import normal_ class TextEncoder(nn.Layer): @@ -105,10 +106,6 @@ class TextEncoder(nn.Layer): # define modules self.emb = nn.Embedding(vocabs, attention_dim) - dist = paddle.distribution.Normal(loc=0.0, scale=attention_dim**-0.5) - w = dist.sample(self.emb.weight.shape) - self.emb.weight.set_value(w) - self.encoder = Encoder( idim=-1, input_layer=None, @@ -130,6 +127,8 @@ class TextEncoder(nn.Layer): cnn_module_kernel=conformer_kernel_size, ) self.proj = nn.Conv1D(attention_dim, attention_dim * 2, 1) + self.reset_parameters() + 
def forward( self, x: paddle.Tensor, @@ -166,3 +165,9 @@ class TextEncoder(nn.Layer): m, logs = paddle.split(stats, 2, axis=1) return x, m, logs, x_mask + + def reset_parameters(self): + normal_(self.emb.weight, mean=0.0, std=self.attention_dim**-0.5) + if self.emb._padding_idx is not None: + with paddle.no_grad(): + self.emb.weight[self.emb._padding_idx] = 0 diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py index 0ff3a546d..e68ed5643 100644 --- a/paddlespeech/t2s/models/vits/vits.py +++ b/paddlespeech/t2s/models/vits/vits.py @@ -13,6 +13,7 @@ # limitations under the License. # Modified from espnet(https://github.com/espnet/espnet) """VITS module""" +import math from typing import Any from typing import Dict from typing import Optional @@ -27,7 +28,12 @@ from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleMultiPeriodDiscrimi from paddlespeech.t2s.models.hifigan import HiFiGANPeriodDiscriminator from paddlespeech.t2s.models.hifigan import HiFiGANScaleDiscriminator from paddlespeech.t2s.models.vits.generator import VITSGenerator -from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.utils.initialize import _calculate_fan_in_and_fan_out +from paddlespeech.utils.initialize import kaiming_uniform_ +from paddlespeech.utils.initialize import normal_ +from paddlespeech.utils.initialize import ones_ +from paddlespeech.utils.initialize import uniform_ +from paddlespeech.utils.initialize import zeros_ AVAILABLE_GENERATERS = { "vits_generator": VITSGenerator, @@ -152,8 +158,7 @@ class VITS(nn.Layer): "use_spectral_norm": False, }, }, - cache_generator_outputs: bool=True, - init_type: str="xavier_uniform", ): + cache_generator_outputs: bool=True, ): """Initialize VITS module. 
Args: idim (int): @@ -179,9 +184,6 @@ class VITS(nn.Layer): assert check_argument_types() super().__init__() - # initialize parameters - initialize(self, init_type) - # define modules generator_class = AVAILABLE_GENERATERS[generator_type] if generator_type == "vits_generator": @@ -196,8 +198,6 @@ class VITS(nn.Layer): self.discriminator = discriminator_class( **discriminator_params, ) - nn.initializer.set_global_initializer(None) - # cache self.cache_generator_outputs = cache_generator_outputs self._cache = None @@ -214,6 +214,10 @@ class VITS(nn.Layer): self.reuse_cache_gen = True self.reuse_cache_dis = True + self.reset_parameters() + self.generator.decoder.reset_parameters() + self.generator.text_encoder.reset_parameters() + def forward( self, text: paddle.Tensor, @@ -243,7 +247,7 @@ class VITS(nn.Layer): forward_generator (bool): Whether to forward generator. Returns: - + """ if forward_generator: return self._forward_generator( @@ -290,7 +294,7 @@ class VITS(nn.Layer): lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). 
Returns: - + """ # setup feats = feats.transpose([0, 2, 1]) @@ -497,3 +501,34 @@ class VITS(nn.Layer): lids, ) return dict(wav=paddle.reshape(wav, [-1])) + + def reset_parameters(self): + def _reset_parameters(module): + if isinstance(module, + (nn.Conv1D, nn.Conv1DTranspose, nn.Conv2D, nn.Conv2DTranspose)): + kaiming_uniform_(module.weight, a=math.sqrt(5)) + if module.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(module.weight) + if fan_in != 0: + bound = 1 / math.sqrt(fan_in) + uniform_(module.bias, -bound, bound) + + if isinstance(module, + (nn.BatchNorm1D, nn.BatchNorm2D, nn.GroupNorm, nn.LayerNorm)): + ones_(module.weight) + zeros_(module.bias) + + if isinstance(module, nn.Linear): + kaiming_uniform_(module.weight, a=math.sqrt(5)) + if module.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(module.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + uniform_(module.bias, -bound, bound) + + if isinstance(module, nn.Embedding): + normal_(module.weight) + if module._padding_idx is not None: + with paddle.no_grad(): + module.weight[module._padding_idx] = 0 + + self.apply(_reset_parameters) diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/ssml/xml_processor.py index b39121347..892ca371e 100644 --- a/paddlespeech/t2s/ssml/xml_processor.py +++ b/paddlespeech/t2s/ssml/xml_processor.py @@ -74,6 +74,28 @@ class MixTextProcessor(): ctlist.append([mixstr, []]) return ctlist + @classmethod + def get_dom_split(self, mixstr): + ''' 文本分解,顺序加了列表中,返回文本和say-as标签 + ''' + ctlist = [] + patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) + mat = re.match(patn, mixstr) + if mat: + pre_xml = mat.group(1) + in_xml = mat.group(2) + after_xml = mat.group(3) + + ctlist.append(pre_xml) + dom = DomXml(in_xml) + tags = dom.get_text_and_sayas_tags() + ctlist.extend(tags) + + ctlist.append(after_xml) + return ctlist + else: + ctlist.append(mixstr) + return ctlist class DomXml(): def __init__(self, xmlstr): @@ -156,3 +178,15 @@ 
class DomXml(): if x.hasAttribute('pinyin'): # pinyin print(x.tagName, 'pinyin', x.getAttribute('pinyin'), x.firstChild.data) + + def get_text_and_sayas_tags(self): + '''返回 xml 内容的列表,包括所有文本内容和 tag''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + res.append(x1.value) + else: + for x2 in x1.childNodes: + res.append(x2.toxml()) + return res diff --git a/paddlespeech/utils/initialize.py b/paddlespeech/utils/initialize.py new file mode 100644 index 000000000..8ebe6845e --- /dev/null +++ b/paddlespeech/utils/initialize.py @@ -0,0 +1,321 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py +Ths copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file. 
+""" +import math + +import numpy as np +import paddle +import paddle.nn as nn + +__all__ = [ + "uniform_", + "normal_", + "constant_", + "ones_", + "zeros_", + "xavier_uniform_", + "xavier_normal_", + "kaiming_uniform_", + "kaiming_normal_", + "linear_init_", + "conv_init_", + "reset_initialized_parameter", + "_calculate_fan_in_and_fan_out", +] + + +def _no_grad_uniform_(tensor, a, b): + with paddle.no_grad(): + tensor.set_value( + paddle.uniform( + shape=tensor.shape, dtype=tensor.dtype, min=a, max=b)) + return tensor + + +def _no_grad_normal_(tensor, mean=0.0, std=1.0): + with paddle.no_grad(): + tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape)) + return tensor + + +def _no_grad_fill_(tensor, value=0.0): + with paddle.no_grad(): + tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype)) + return tensor + + +def uniform_(tensor, a, b): + """ + Modified tensor inspace using uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + a (float|int): min value. + b (float|int): max value. + Return: + tensor + """ + return _no_grad_uniform_(tensor, a, b) + + +def normal_(tensor, mean=0.0, std=1.0): + """ + Modified tensor inspace using normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mean (float|int): mean value. + std (float|int): std value. + Return: + tensor + """ + return _no_grad_normal_(tensor, mean, std) + + +def constant_(tensor, value=0.0): + """ + Modified tensor inspace using constant_ + Args: + tensor (paddle.Tensor): paddle Tensor + value (float|int): value to fill tensor. 
+ Return: + tensor + """ + return _no_grad_fill_(tensor, value) + + +def ones_(tensor): + """ + Modified tensor inspace using ones_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 1) + + +def zeros_(tensor): + """ + Modified tensor inspace using zeros_ + Args: + tensor (paddle.Tensor): paddle Tensor + Return: + tensor + """ + return _no_grad_fill_(tensor, 0) + + +def vector_(tensor, vector): + with paddle.no_grad(): + tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype)) + return tensor + + +def _calculate_fan_in_and_fan_out(tensor, reverse=False): + """ + Calculate (fan_in, _fan_out) for tensor + Args: + tensor (Tensor): paddle.Tensor + reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True + Return: + Tuple[fan_in, fan_out] + """ + if tensor.ndim < 2: + raise ValueError( + "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions" + ) + + if reverse: + num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1] + else: + num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0] + + receptive_field_size = 1 + if tensor.ndim > 2: + receptive_field_size = np.prod(tensor.shape[2:]) + + fan_in = num_input_fmaps * receptive_field_size + fan_out = num_output_fmaps * receptive_field_size + + return fan_in, fan_out + + +def xavier_uniform_(tensor, gain=1.0, reverse=False): + """ + Modified tensor inspace using xavier_uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def xavier_normal_(tensor, gain=1.0, reverse=False): + """ + Modified tensor inspace using xavier_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + gain (float): super parameter, 1. default. + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse) + std = gain * math.sqrt(2.0 / float(fan_in + fan_out)) + return _no_grad_normal_(tensor, 0, std) + + +# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html +def _calculate_correct_fan(tensor, mode, reverse=False): + mode = mode.lower() + valid_modes = ["fan_in", "fan_out"] + if mode not in valid_modes: + raise ValueError("Mode {} not supported, please use one of {}".format( + mode, valid_modes)) + + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse) + + return fan_in if mode == "fan_in" else fan_out + + +def _calculate_gain(nonlinearity, param=None): + linear_fns = [ + "linear", "conv1d", "conv2d", "conv3d", "conv_transpose1d", + "conv_transpose2d", "conv_transpose3d" + ] + if nonlinearity in linear_fns or nonlinearity == "sigmoid": + return 1 + elif nonlinearity == "tanh": + return 5.0 / 3 + elif nonlinearity == "relu": + return math.sqrt(2.0) + elif nonlinearity == "leaky_relu": + if param is None: + negative_slope = 0.01 + elif not isinstance(param, bool) and isinstance( + param, int) or isinstance(param, float): + # True/False are instances of int, hence check above + negative_slope = param + else: + raise ValueError( + "negative_slope {} not a valid number".format(param)) + return math.sqrt(2.0 / (1 + negative_slope**2)) + elif nonlinearity == "selu": + return 3.0 / 4 + else: + raise 
ValueError("Unsupported nonlinearity {}".format(nonlinearity)) + + +def kaiming_uniform_(tensor, + a=0, + mode="fan_in", + nonlinearity="leaky_relu", + reverse=False): + """ + Modified tensor inspace using kaiming_uniform method + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. + Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + k = math.sqrt(3.0) * std + return _no_grad_uniform_(tensor, -k, k) + + +def kaiming_normal_(tensor, + a=0, + mode="fan_in", + nonlinearity="leaky_relu", + reverse=False): + """ + Modified tensor inspace using kaiming_normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mode (str): ['fan_in', 'fan_out'], 'fin_in' defalut + nonlinearity (str): nonlinearity method name + reverse (bool): reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + return _no_grad_normal_(tensor, 0, std) + + +def linear_init_(module): + bound = 1 / math.sqrt(module.weight.shape[0]) + uniform_(module.weight, -bound, bound) + uniform_(module.bias, -bound, bound) + + +def conv_init_(module): + bound = 1 / np.sqrt(np.prod(module.weight.shape[1:])) + uniform_(module.weight, -bound, bound) + if module.bias is not None: + uniform_(module.bias, -bound, bound) + + +def bias_init_with_prob(prior_prob=0.01): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +@paddle.no_grad() +def reset_initialized_parameter(model, include_self=True): + """ + Reset initialized parameter using following method for [conv, linear, embedding, bn] + Args: + model (paddle.Layer): paddle Layer + include_self (bool: False): include_self for Layer.named_sublayers method. 
Indicates whether to include the Layer itself + Return: + None + """ + for _, m in model.named_sublayers(include_self=include_self): + if isinstance(m, nn.Conv2D): + k = float(m._groups) / (m._in_channels * m._kernel_size[0] * + m._kernel_size[1]) + k = math.sqrt(k) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Linear): + k = math.sqrt(1.0 / m.weight.shape[0]) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Embedding): + _no_grad_normal_(m.weight, mean=0.0, std=1.0) + + elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)): + _no_grad_fill_(m.weight, 1.0) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_fill_(m.bias, 0) diff --git a/setup.py b/setup.py index 212d3b109..be6cf63a9 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,6 @@ base = [ "paddlespeech_feat", "Pillow>=9.0.0", "praatio==5.0.0", - "protobuf>=3.1.0, <=3.20.0", "pypinyin<=0.44.0", "pypinyin-dict", "python-dateutil", @@ -72,12 +71,9 @@ base = [ "yacs~=0.1.8", "prettytable", "zhon", - "colorlog", - "pathos==0.2.8", "braceexpand", "pyyaml", - "pybind11", - "paddleslim==2.3.4", + "paddleslim>=2.3.4", "paddleaudio>=1.0.2", ] diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 3a58626d2..5d3b76f6c 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -14,7 +14,7 @@ paddlespeech ssl --task asr --lang en --input ./en.wav paddlespeech ssl --task vector --lang en --input ./en.wav # Speech_recognition -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav paddlespeech asr --input ./zh.wav paddlespeech asr 
--model conformer_aishell --input ./zh.wav paddlespeech asr --model conformer_online_aishell --input ./zh.wav @@ -26,6 +26,7 @@ paddlespeech asr --model deepspeech2offline_aishell --input ./zh.wav paddlespeech asr --model deepspeech2online_wenetspeech --input ./zh.wav paddlespeech asr --model deepspeech2online_aishell --input ./zh.wav paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.wav +paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav # Support editing num_decoding_left_chunks paddlespeech asr --model conformer_online_wenetspeech --num_decoding_left_chunks 3 --input ./zh.wav