diff --git a/demos/speech_web/.gitignore b/demos/speech_web/.gitignore
index 54418e60..1e961a38 100644
--- a/demos/speech_web/.gitignore
+++ b/demos/speech_web/.gitignore
@@ -13,4 +13,7 @@
 *.pdmodel
 */source/*
 */PaddleSpeech/*
+*/tmp*/*
+*/durations.txt
+*/oov_info.txt
 
diff --git a/demos/speech_web/README.md b/demos/speech_web/README.md
index 3b2da6e9..e8c59ea8 100644
--- a/demos/speech_web/README.md
+++ b/demos/speech_web/README.md
@@ -1,55 +1,79 @@
 # Paddle Speech Demo
 
-PaddleSpeechDemo 是一个以 PaddleSpeech 的语音交互功能为主体开发的 Demo 展示项目，用于帮助大家更好的上手 PaddleSpeech 以及使用 PaddleSpeech 构建自己的应用。
+## 简介
+Paddle Speech Demo 是一个以 PaddleSpeech 的语音交互功能为主体开发的 Demo 展示项目，用于帮助大家更好的上手 PaddleSpeech 以及使用 PaddleSpeech 构建自己的应用。
 
-智能语音交互部分使用 PaddleSpeech，对话以及信息抽取部分使用 PaddleNLP，网页前端展示部分基于 Vue3 进行开发
+智能语音交互部分使用 PaddleSpeech，对话以及信息抽取部分使用 PaddleNLP，网页前端展示部分基于 Vue3 进行开发。
 
 主要功能：
 
+`main.py` 中包含功能
 + 语音聊天：PaddleSpeech 的语音识别能力+语音合成能力，对话部分基于 PaddleNLP 的闲聊功能
 + 声纹识别：PaddleSpeech 的声纹识别功能展示
 + 语音识别：支持【实时语音识别】，【端到端识别】，【音频文件识别】三种模式
 + 语音合成：支持【流式合成】与【端到端合成】两种方式
 + 语音指令：基于 PaddleSpeech 的语音识别能力与 PaddleNLP 的信息抽取，实现交通费的智能报销
 
+`vc.py` 中包含功能
++ 一句话合成：基于 GE2E 和 ECAPA-TDNN 模型的一句话合成方案，可以模仿输入的音频的音色进行合成任务
+  + GE2E 音色克隆方案可以参考： [【FastSpeech2 + AISHELL-3 Voice Cloning】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc1)
+  + ECAPA-TDNN 音色克隆方案可以参考: [【FastSpeech2 + AISHELL-3 Voice Cloning (ECAPA-TDNN)】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/vc2)
+
++ 小数据微调：基于小数据集的微调方案，内置用12句话标贝中文女声微调示例，你也可以通过一键重置，录制自己的声音，注意在安静环境下录制，效果会更好。你可以在 [【Finetune your own AM based on FastSpeech2 with AISHELL-3】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/tts_finetune/tts3)中尝试使用自己的数据集进行微调。
+
++ ERNIE-SAT：语言-语音跨模态大模型 ERNIE-SAT 可视化展示示例，支持个性化合成，跨语言语音合成（音频为中文则输入英文文本进行合成），语音编辑（修改音频文字中间的结果）功能。 ERNIE-SAT 更多实现细节，可以参考：
+  + [【ERNIE-SAT with AISHELL-3 dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/ernie_sat)
+  + [【ERNIE-SAT with AISHELL3 and VCTK datasets】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat)
+  + [【ERNIE-SAT with VCTK dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/ernie_sat)
+
 运行效果：
 
- ![效果](docs/效果展示.png)
+ ![效果](https://user-images.githubusercontent.com/30135920/191188766-12e7ca15-f7b4-45f8-9da5-0c0b0bbe5fcb.png)
 
-## 安装
 
-### 后端环境安装
 
-```
-# 安装环境
-cd speech_server
-pip install -r requirements.txt
+## 基础环境安装
 
-# 下载 ie 模型，针对地点进行微调，效果更好，不下载的话会使用其它版本，效果没有这个好
-cd source
-mkdir model
-cd model
-wget https://bj.bcebos.com/paddlenlp/applications/speech-cmd-analysis/finetune/model_state.pdparams
+### 后端环境安装
+```bash 
+cd speech_server
+pip install -r requirements.txt -i https://mirror.baidu.com/pypi/simple
+cd ../
 ```
 
 ### 前端环境安装
-
 前端依赖 `node.js` ，需要提前安装，确保 `npm` 可用，`npm` 测试版本 `8.3.1`，建议下载[官网](https://nodejs.org/en/)稳定版的 `node.js`
 
-```
+```bash
 # 进入前端目录
 cd web_client
-
 # 安装 `yarn`，已经安装可跳过
 npm install -g yarn
-
 # 使用yarn安装前端依赖
 yarn install
+cd ../
 ```
 
+
 ## 启动服务
+【注意】目前只支持 `main.py` 和 `vc.py` 两者中选择开启一个后端服务。
+
+### 启动 `main.py` 后端服务
+
+#### 下载相关模型
+
+只需手动下载语音指令所需模型即可，其他模型会自动下载。
 
-### 开启后端服务
+```bash
+cd speech_server
+mkdir -p source/model
+cd source/model
+# 下载IE模型
+wget https://bj.bcebos.com/paddlenlp/applications/speech-cmd-analysis/finetune/model_state.pdparams
+cd ../../../
+
+```
+#### 启动后端服务
 
 ```
 cd speech_server
@@ -57,7 +81,91 @@ cd speech_server
 python main.py --port 8010
 ```
 
-### 开启前端服务
+
+### 启动 `vc.py` 后端服务
+
+#### 下载相关模型和音频
+
+```bash
+cd speech_server
+
+# 已创建则跳过
+mkdir -p source/model
+cd source
+# 下载 & 解压 wav （包含VC测试音频）
+wget https://paddlespeech.bj.bcebos.com/demos/speech_web/wav_vc.zip
+unzip wav_vc.zip
+
+cd model
+# 下载 GE2E 相关模型
+wget https://bj.bcebos.com/paddlespeech/Parakeet/released_models/ge2e/ge2e_ckpt_0.3.zip
+unzip ge2e_ckpt_0.3.zip
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip
+unzip pwg_aishell3_ckpt_0.5.zip
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip
+unzip fastspeech2_nosil_aishell3_vc1_ckpt_0.5.zip
+
+# 下载 ECAPA-TDNN 相关模型
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_vc2_1.2.0.zip
+unzip fastspeech2_aishell3_ckpt_vc2_1.2.0.zip
+
+# 下载 ERNIE-SAT 相关模型
+# aishell3 ERNIE-SAT
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_aishell3_ckpt_1.2.0.zip
+unzip erniesat_aishell3_ckpt_1.2.0.zip
+
+# vctk ERNIE-SAT
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_vctk_ckpt_1.2.0.zip
+unzip erniesat_vctk_ckpt_1.2.0.zip
+
+# aishell3_vctk ERNIE-SAT
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/ernie_sat/erniesat_aishell3_vctk_ckpt_1.2.0.zip
+unzip erniesat_aishell3_vctk_ckpt_1.2.0.zip
+
+# 下载 finetune 相关模型
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_1.1.0.zip
+unzip fastspeech2_aishell3_ckpt_1.1.0.zip
+
+# 下载声码器
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip
+unzip hifigan_aishell3_ckpt_0.2.0.zip
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip
+unzip hifigan_vctk_ckpt_0.2.0.zip
+
+cd ../../../
+```
+
+#### ERNIE-SAT 环境配置
+
+ERNIE-SAT 体验依赖于 [examples/aishell3_vctk/ernie_sat](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat) 的环境。参考 `examples/aishell3_vctk/ernie_sat` 下的 `README.md`， 确保 `examples/aishell3_vctk/ernie_sat` 下 `run.sh` 相关示例代码有效。
+ 
+运行好 `examples/aishell3_vctk/ernie_sat` 后，回到当前目录，创建环境：
+```bash
+cd speech_server
+ln -snf ../../../examples/aishell3_vctk/ernie_sat/download .
+ln -snf ../../../examples/aishell3_vctk/ernie_sat/tools .
+cd ../
+```
+
+#### finetune 环境配置
+
+`finetune` 需要解压 `tools/aligner` 中的 `aishell3_model.zip`，finetune 过程需要使用到 `tools/aligner/aishell3_model/meta.yaml` 文件。
+
+```bash
+cd speech_server/tools/aligner
+unzip aishell3_model.zip
+cd -
+```
+
+#### 启动后端服务
+
+```
+cd speech_server
+# 默认8010端口
+python vc.py --port 8010
+```
+
+### 启动前端服务
 
 ```
 cd web_client
@@ -65,6 +173,9 @@ yarn dev --port 8011
 ```
 
 默认配置下，前端中配置的后台地址信息是 localhost，确保后端服务器和打开页面的游览器在同一台机器上，不在一台机器的配置方式见下方的 FAQ：【后端如果部署在其它机器或者别的端口如何修改】
+
+
+
 ## FAQ 
 
 #### Q: 如何安装node.js
@@ -75,7 +186,7 @@ A： node.js的安装可以参考[【菜鸟教程】](https://www.runoob.com/nod
 
 A：后端的配置地址有分散在两个文件中
 
-修改第一个文件 `PaddleSpeechWebClient/vite.config.js`
+修改第一个文件 `./web_client/vite.config.js`
 
 ```
 server: {
@@ -90,7 +201,7 @@ server: {
   }
 ```
 
-修改第二个文件 `PaddleSpeechWebClient/src/api/API.js`（ Websocket 代理配置失败，所以需要在这个文件中修改）
+修改第二个文件 `./web_client/src/api/API.js`（ Websocket 代理配置失败，所以需要在这个文件中修改）
 
 ```
 // websocket （这里改成后端所在的接口）
diff --git a/demos/speech_web/docs/效果展示.png b/demos/speech_web/docs/效果展示.png
deleted file mode 100644
index 5f7997c1..00000000
Binary files a/demos/speech_web/docs/效果展示.png and /dev/null differ
diff --git a/examples/other/tts_finetune/tts3/finetune.yaml b/demos/speech_web/speech_server/conf/tts3_finetune.yaml
similarity index 86%
rename from examples/other/tts_finetune/tts3/finetune.yaml
rename to demos/speech_web/speech_server/conf/tts3_finetune.yaml
index 374a69f3..4f708bd7 100644
--- a/examples/other/tts_finetune/tts3/finetune.yaml
+++ b/demos/speech_web/speech_server/conf/tts3_finetune.yaml
@@ -3,10 +3,10 @@
 ###########################################################
 # Set to -1 to indicate that the parameter is the same as the pretrained model configuration
 
-batch_size: -1
+batch_size: 10
 learning_rate: 0.0001     # learning rate
 num_snapshots: -1
 
 # frozen_layers should be a list
 # if you don't need to freeze, set frozen_layers to []
-frozen_layers: ["encoder", "duration_predictor"]
+frozen_layers: ["encoder"]
diff --git a/demos/speech_web/speech_server/main.py b/demos/speech_web/speech_server/main.py
index d4750d59..03e7e599 100644
--- a/demos/speech_web/speech_server/main.py
+++ b/demos/speech_web/speech_server/main.py
@@ -1,8 +1,3 @@
-# todo:
-# 1. 开启服务
-# 2. 接收录音音频，返回识别结果
-# 3. 接收ASR识别结果，返回NLP对话结果
-# 4. 接收NLP对话结果，返回TTS音频
 import argparse
 import base64
 import datetime
@@ -32,6 +27,7 @@ from starlette.requests import Request
 from starlette.responses import FileResponse
 from starlette.websockets import WebSocketState as WebSocketState
 
+from paddlespeech.cli.tts.infer import TTSExecutor
 from paddlespeech.server.engine.asr.online.python.asr_engine import PaddleASRConnectionHanddler
 from paddlespeech.server.utils.audio_process import float2pcm
 
@@ -55,7 +51,7 @@ asr_config = "conf/ws_conformer_wenetspeech_application_faster.yaml"
 asr_init_path = "source/demo/demo.wav"
 db_path = "source/db/vpr.sqlite"
 ie_model_path = "source/model"
-
+tts_model = TTSExecutor()
 # 路径配置
 UPLOAD_PATH = "source/vpr"
 WAV_PATH = "source/wav"
@@ -72,6 +68,14 @@ manager = ConnectionManager()
 aumanager = AudioMannger(chatbot)
 aumanager.init()
 vpr = VPR(db_path, dim=192, top_k=5)
+# Run one synthesis at startup so the TTS models are downloaded and initialised
+tts_model(
+    text="今天天气准不错",
+    output="test.wav",
+    am='fastspeech2_mix',
+    spk_id=174,
+    voc='hifigan_csmsc',
+    lang='mix', )
 
 
 # 服务配置
@@ -331,6 +335,7 @@ async def ieOffline(nlp_base: NlpBase):
 #####################################################################
 
 
+# 端到端合成
 @app.post("/tts/offline")
 async def text2speechOffline(tts_base: TtsBase):
     text = tts_base.text
@@ -340,8 +345,14 @@ async def text2speechOffline(tts_base: TtsBase):
         now_name = "tts_" + datetime.datetime.strftime(
             datetime.datetime.now(), '%Y%m%d%H%M%S') + randName() + ".wav"
         out_file_path = os.path.join(WAV_PATH, now_name)
-        # 保存为文件，再转成base64传输
-        chatbot.text2speech(text, outpath=out_file_path)
+        # 使用中英混合CLI
+        tts_model(
+            text=text,
+            output=out_file_path,
+            am='fastspeech2_mix',
+            spk_id=174,
+            voc='hifigan_csmsc',
+            lang='mix')
         with open(out_file_path, "rb") as f:
             data_bin = f.read()
         base_str = base64.b64encode(data_bin)
diff --git a/demos/speech_web/speech_server/requirements.txt b/demos/speech_web/speech_server/requirements.txt
index 607f0d4d..cdc65465 100644
--- a/demos/speech_web/speech_server/requirements.txt
+++ b/demos/speech_web/speech_server/requirements.txt
@@ -1,13 +1,8 @@
 aiofiles
 faiss-cpu
-fastapi
-librosa
-numpy
-paddlenlp
-paddlepaddle
-paddlespeech
+praatio==5.0.0
 pydantic
-python-multipartscikit_learn
-SoundFile
+python-multipart
+scikit_learn
 starlette
 uvicorn
diff --git a/demos/speech_web/speech_server/src/ernie_sat.py b/demos/speech_web/speech_server/src/ernie_sat.py
new file mode 100644
index 00000000..b74dd8e3
--- /dev/null
+++ b/demos/speech_web/speech_server/src/ernie_sat.py
@@ -0,0 +1,195 @@
+import os
+
+from .util import MAIN_ROOT
+from .util import run_cmd
+
+
+class SAT:
+    def __init__(self):
+        # pretrain model path
+        self.zh_pretrain_model_path = os.path.realpath(
+            "source/model/erniesat_aishell3_ckpt_1.2.0")
+        self.en_pretrain_model_path = os.path.realpath(
+            "source/model/erniesat_vctk_ckpt_1.2.0")
+        self.cross_pretrain_model_path = os.path.realpath(
+            "source/model/erniesat_aishell3_vctk_ckpt_1.2.0")
+
+        self.zh_voc_model_path = os.path.realpath(
+            "source/model/hifigan_aishell3_ckpt_0.2.0")
+        self.eb_voc_model_path = os.path.realpath(
+            "source/model/hifigan_vctk_ckpt_0.2.0")
+        self.cross_voc_model_path = os.path.realpath(
+            "source/model/hifigan_aishell3_ckpt_0.2.0")
+
+        self.BIN_DIR = os.path.join(MAIN_ROOT,
+                                    "paddlespeech/t2s/exps/ernie_sat")
+
+    def zh_synthesize_edit(self,
+                           old_str: str,
+                           new_str: str,
+                           input_name: os.PathLike,
+                           output_name: os.PathLike,
+                           task_name: str="synthesize",
+                           erniesat_ckpt_name: str="snapshot_iter_289500.pdz"):
+
+        if task_name not in ['synthesize', 'edit']:
+            print("task name only in ['edit', 'synthesize']")
+            return None
+
+        # 推理文件配置
+        config_path = os.path.join(self.zh_pretrain_model_path, "default.yaml")
+        phones_dict = os.path.join(self.zh_pretrain_model_path,
+                                   "phone_id_map.txt")
+        erniesat_ckpt = os.path.join(self.zh_pretrain_model_path,
+                                     erniesat_ckpt_name)
+        erniesat_stat = os.path.join(self.zh_pretrain_model_path,
+                                     "speech_stats.npy")
+
+        voc = "hifigan_aishell3"
+        voc_config = os.path.join(self.zh_voc_model_path, "default.yaml")
+        voc_ckpt = os.path.join(self.zh_voc_model_path,
+                                "snapshot_iter_2500000.pdz")
+        voc_stat = os.path.join(self.zh_voc_model_path, "feats_stats.npy")
+
+        cmd = self.get_cmd(
+            task_name=task_name,
+            input_name=input_name,
+            old_str=old_str,
+            new_str=new_str,
+            config_path=config_path,
+            phones_dict=phones_dict,
+            erniesat_ckpt=erniesat_ckpt,
+            erniesat_stat=erniesat_stat,
+            voc=voc,
+            voc_config=voc_config,
+            voc_ckpt=voc_ckpt,
+            voc_stat=voc_stat,
+            output_name=output_name,
+            source_lang="zh",
+            target_lang="zh")
+
+        return run_cmd(cmd, output_name)
+
+    def crossclone(self,
+                   old_str: str,
+                   new_str: str,
+                   input_name: os.PathLike,
+                   output_name: os.PathLike,
+                   source_lang: str,
+                   target_lang: str,
+                   erniesat_ckpt_name: str="snapshot_iter_489000.pdz"):
+        # 推理文件配置
+        config_path = os.path.join(self.cross_pretrain_model_path,
+                                   "default.yaml")
+        phones_dict = os.path.join(self.cross_pretrain_model_path,
+                                   "phone_id_map.txt")
+        erniesat_ckpt = os.path.join(self.cross_pretrain_model_path,
+                                     erniesat_ckpt_name)
+        erniesat_stat = os.path.join(self.cross_pretrain_model_path,
+                                     "speech_stats.npy")
+
+        voc = "hifigan_aishell3"
+        voc_config = os.path.join(self.cross_voc_model_path, "default.yaml")
+        voc_ckpt = os.path.join(self.cross_voc_model_path,
+                                "snapshot_iter_2500000.pdz")
+        voc_stat = os.path.join(self.cross_voc_model_path, "feats_stats.npy")
+        task_name = "synthesize"
+        cmd = self.get_cmd(
+            task_name=task_name,
+            input_name=input_name,
+            old_str=old_str,
+            new_str=new_str,
+            config_path=config_path,
+            phones_dict=phones_dict,
+            erniesat_ckpt=erniesat_ckpt,
+            erniesat_stat=erniesat_stat,
+            voc=voc,
+            voc_config=voc_config,
+            voc_ckpt=voc_ckpt,
+            voc_stat=voc_stat,
+            output_name=output_name,
+            source_lang=source_lang,
+            target_lang=target_lang)
+
+        return run_cmd(cmd, output_name)
+
+    def en_synthesize_edit(self,
+                           old_str: str,
+                           new_str: str,
+                           input_name: os.PathLike,
+                           output_name: os.PathLike,
+                           task_name: str="synthesize",
+                           erniesat_ckpt_name: str="snapshot_iter_199500.pdz"):
+
+        # 推理文件配置
+        config_path = os.path.join(self.en_pretrain_model_path, "default.yaml")
+        phones_dict = os.path.join(self.en_pretrain_model_path,
+                                   "phone_id_map.txt")
+        erniesat_ckpt = os.path.join(self.en_pretrain_model_path,
+                                     erniesat_ckpt_name)
+        erniesat_stat = os.path.join(self.en_pretrain_model_path,
+                                     "speech_stats.npy")
+
+        voc = "hifigan_aishell3"
+        voc_config = os.path.join(self.zh_voc_model_path, "default.yaml")
+        voc_ckpt = os.path.join(self.zh_voc_model_path,
+                                "snapshot_iter_2500000.pdz")
+        voc_stat = os.path.join(self.zh_voc_model_path, "feats_stats.npy")
+
+        cmd = self.get_cmd(
+            task_name=task_name,
+            input_name=input_name,
+            old_str=old_str,
+            new_str=new_str,
+            config_path=config_path,
+            phones_dict=phones_dict,
+            erniesat_ckpt=erniesat_ckpt,
+            erniesat_stat=erniesat_stat,
+            voc=voc,
+            voc_config=voc_config,
+            voc_ckpt=voc_ckpt,
+            voc_stat=voc_stat,
+            output_name=output_name,
+            source_lang="en",
+            target_lang="en")
+
+        return run_cmd(cmd, output_name)
+
+    def get_cmd(self,
+                task_name: str,
+                input_name: str,
+                old_str: str,
+                new_str: str,
+                config_path: str,
+                phones_dict: str,
+                erniesat_ckpt: str,
+                erniesat_stat: str,
+                voc: str,
+                voc_config: str,
+                voc_ckpt: str,
+                voc_stat: str,
+                output_name: str,
+                source_lang: str,
+                target_lang: str):
+        cmd = f"""
+            FLAGS_allocator_strategy=naive_best_fit \
+            FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+            python3 {self.BIN_DIR}/synthesize_e2e.py \
+                --task_name={task_name} \
+                --wav_path={input_name} \
+                --old_str='{old_str}' \
+                --new_str='{new_str}' \
+                --source_lang={source_lang} \
+                --target_lang={target_lang} \
+                --erniesat_config={config_path} \
+                --phones_dict={phones_dict} \
+                --erniesat_ckpt={erniesat_ckpt} \
+                --erniesat_stat={erniesat_stat} \
+                --voc={voc} \
+                --voc_config={voc_config} \
+                --voc_ckpt={voc_ckpt} \
+                --voc_stat={voc_stat} \
+                --output_name={output_name}
+        """
+
+        return cmd
diff --git a/demos/speech_web/speech_server/src/finetune.py b/demos/speech_web/speech_server/src/finetune.py
new file mode 100644
index 00000000..d7a440f9
--- /dev/null
+++ b/demos/speech_web/speech_server/src/finetune.py
@@ -0,0 +1,125 @@
+import os
+
+from .util import MAIN_ROOT
+from .util import run_cmd
+
+
+def find_max_ckpt(model_path):
+    max_ckpt = 0
+    for filename in os.listdir(model_path):
+        if filename.endswith('.pdz'):
+            files = filename[:-4]
+            a1, a2, it = files.split("_")
+            if int(it) > max_ckpt:
+                max_ckpt = int(it)
+    return max_ckpt
+
+
+class FineTune:
+    def __init__(self):
+        self.now_file_path = os.path.dirname(__file__)
+        self.PYTHONPATH = os.path.join(MAIN_ROOT,
+                                       "examples/other/tts_finetune/tts3")
+        self.BIN_DIR = os.path.join(MAIN_ROOT,
+                                    "paddlespeech/t2s/exps/fastspeech2")
+        self.pretrained_model_dir = os.path.realpath(
+            "source/model/fastspeech2_aishell3_ckpt_1.1.0")
+        self.voc_model_dir = os.path.realpath(
+            "source/model/hifigan_aishell3_ckpt_0.2.0")
+        self.finetune_config = os.path.join("conf/tts3_finetune.yaml")
+
+    def finetune(self, input_dir, exp_dir='temp', epoch=100):
+        """
+        use cmd follow examples/other/tts_finetune/tts3/run.sh
+        """
+        newdir_name = "newdir"
+        new_dir = os.path.join(input_dir, newdir_name)
+        mfa_dir = os.path.join(exp_dir, 'mfa_result')
+        dump_dir = os.path.join(exp_dir, 'dump')
+        output_dir = os.path.join(exp_dir, 'exp')
+        lang = "zh"
+        ngpu = 1
+
+        cmd = f"""
+            # check oov
+            python3 {self.PYTHONPATH}/local/check_oov.py \
+                --input_dir={input_dir} \
+                --pretrained_model_dir={self.pretrained_model_dir} \
+                --newdir_name={newdir_name} \
+                --lang={lang}
+            
+            # get mfa result
+            python3 {self.PYTHONPATH}/local/get_mfa_result.py \
+                --input_dir={new_dir} \
+                --mfa_dir={mfa_dir} \
+                --lang={lang}
+            
+            # generate durations.txt
+            python3 {self.PYTHONPATH}/local/generate_duration.py \
+                --mfa_dir={mfa_dir} 
+            
+            # extract feature
+            python3 {self.PYTHONPATH}/local/extract_feature.py \
+                --duration_file="./durations.txt" \
+                --input_dir={new_dir} \
+                --dump_dir={dump_dir} \
+                --pretrained_model_dir={self.pretrained_model_dir}
+            
+            # create finetune env
+            python3 {self.PYTHONPATH}/local/prepare_env.py \
+                --pretrained_model_dir={self.pretrained_model_dir} \
+                --output_dir={output_dir}
+            
+            # finetune
+            python3 {self.PYTHONPATH}/local/finetune.py \
+                --pretrained_model_dir={self.pretrained_model_dir} \
+                --dump_dir={dump_dir} \
+                --output_dir={output_dir} \
+                --ngpu={ngpu} \
+                --epoch=100 \
+                --finetune_config={self.finetune_config}
+        """
+
+        print(cmd)
+
+        return run_cmd(cmd, exp_dir)
+
+    def synthesize(self, text, wav_name, out_wav_dir, exp_dir='temp'):
+
+        voc = "hifigan_aishell3"
+        dump_dir = os.path.join(exp_dir, 'dump')
+        output_dir = os.path.join(exp_dir, 'exp')
+        text_path = os.path.join(exp_dir, 'sentences.txt')
+        lang = "zh"
+        ngpu = 1
+
+        model_path = f"{output_dir}/checkpoints"
+        ckpt = find_max_ckpt(model_path)
+
+        # 生成对应的语句
+        with open(text_path, "w", encoding='utf8') as f:
+            f.write(wav_name + " " + text)
+
+        cmd = f"""
+            FLAGS_allocator_strategy=naive_best_fit \
+            FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+            python3 {self.BIN_DIR}/../synthesize_e2e.py \
+                --am=fastspeech2_aishell3 \
+                --am_config={self.pretrained_model_dir}/default.yaml \
+                --am_ckpt={output_dir}/checkpoints/snapshot_iter_{ckpt}.pdz \
+                --am_stat={self.pretrained_model_dir}/speech_stats.npy \
+                --voc={voc} \
+                --voc_config={self.voc_model_dir}/default.yaml \
+                --voc_ckpt={self.voc_model_dir}/snapshot_iter_2500000.pdz \
+                --voc_stat={self.voc_model_dir}/feats_stats.npy \
+                --lang={lang} \
+                --text={text_path} \
+                --output_dir={out_wav_dir} \
+                --phones_dict={dump_dir}/phone_id_map.txt \
+                --speaker_dict={dump_dir}/speaker_id_map.txt \
+                --spk_id=0 
+        """
+
+        out_path = os.path.join(out_wav_dir, f"{wav_name}.wav")
+
+        return run_cmd(cmd, out_path)
diff --git a/demos/speech_web/speech_server/src/ge2e_clone.py b/demos/speech_web/speech_server/src/ge2e_clone.py
new file mode 100644
index 00000000..d90013b9
--- /dev/null
+++ b/demos/speech_web/speech_server/src/ge2e_clone.py
@@ -0,0 +1,57 @@
+import os
+import shutil
+
+from .util import MAIN_ROOT
+from .util import run_cmd
+
+
+class VoiceCloneGE2E():
+    def __init__(self):
+        # Path 到指定路径上
+        self.BIN_DIR = os.path.join(MAIN_ROOT, "paddlespeech/t2s/exps")
+        # am
+        self.am = "fastspeech2_aishell3"
+        self.am_config = "source/model/fastspeech2_nosil_aishell3_vc1_ckpt_0.5/default.yaml"
+        self.am_ckpt = "source/model/fastspeech2_nosil_aishell3_vc1_ckpt_0.5/snapshot_iter_96400.pdz"
+        self.am_stat = "source/model/fastspeech2_nosil_aishell3_vc1_ckpt_0.5/speech_stats.npy"
+        self.phones_dict = "source/model/fastspeech2_nosil_aishell3_vc1_ckpt_0.5/phone_id_map.txt"
+        # voc
+        self.voc = "pwgan_aishell3"
+        self.voc_config = "source/model/pwg_aishell3_ckpt_0.5/default.yaml"
+        self.voc_ckpt = "source/model/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz"
+        self.voc_stat = "source/model/pwg_aishell3_ckpt_0.5/feats_stats.npy"
+        # ge2e
+        self.ge2e_params_path = "source/model/ge2e_ckpt_0.3/step-3000000.pdparams"
+
+    def vc(self, text, input_wav, out_wav):
+
+        # input wav 需要形成临时单独文件夹
+        _, full_file_name = os.path.split(input_wav)
+        ref_audio_dir = os.path.realpath("tmp_dir/ge2e")
+        if os.path.exists(ref_audio_dir):
+            shutil.rmtree(ref_audio_dir)
+        else:
+            os.makedirs(ref_audio_dir, exist_ok=True)
+            shutil.copy(input_wav, ref_audio_dir)
+
+        output_dir = os.path.dirname(out_wav)
+
+        cmd = f"""
+            python3 {self.BIN_DIR}/voice_cloning.py \
+                    --am={self.am} \
+                    --am_config={self.am_config} \
+                    --am_ckpt={self.am_ckpt} \
+                    --am_stat={self.am_stat} \
+                    --voc={self.voc} \
+                    --voc_config={self.voc_config} \
+                    --voc_ckpt={self.voc_ckpt} \
+                    --voc_stat={self.voc_stat} \
+                    --ge2e_params_path={self.ge2e_params_path} \
+                    --text="{text}" \
+                    --input-dir={ref_audio_dir} \
+                    --output-dir={output_dir} \
+                    --phones-dict={self.phones_dict}
+        """
+
+        output_name = os.path.join(output_dir, full_file_name)
+        return run_cmd(cmd, output_name=output_name)
diff --git a/demos/speech_web/speech_server/src/tdnn_clone.py b/demos/speech_web/speech_server/src/tdnn_clone.py
new file mode 100644
index 00000000..c24b9b07
--- /dev/null
+++ b/demos/speech_web/speech_server/src/tdnn_clone.py
@@ -0,0 +1,54 @@
+import os
+import shutil
+
+from .util import MAIN_ROOT
+from .util import run_cmd
+
+
+class VoiceCloneTDNN():
+    def __init__(self):
+        # Path 到指定路径上
+        self.BIN_DIR = os.path.join(MAIN_ROOT, "paddlespeech/t2s/exps")
+
+        self.am = "fastspeech2_aishell3"
+        self.am_config = "source/model/fastspeech2_aishell3_ckpt_vc2_1.2.0/default.yaml"
+        self.am_ckpt = "source/model/fastspeech2_aishell3_ckpt_vc2_1.2.0/snapshot_iter_96400.pdz"
+        self.am_stat = "source/model/fastspeech2_aishell3_ckpt_vc2_1.2.0/speech_stats.npy"
+        self.phones_dict = "source/model/fastspeech2_aishell3_ckpt_vc2_1.2.0/phone_id_map.txt"
+        # voc
+        self.voc = "pwgan_aishell3"
+        self.voc_config = "source/model/pwg_aishell3_ckpt_0.5/default.yaml"
+        self.voc_ckpt = "source/model/pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz"
+        self.voc_stat = "source/model/pwg_aishell3_ckpt_0.5/feats_stats.npy"
+
+    def vc(self, text, input_wav, out_wav):
+        # input wav 需要形成临时单独文件夹
+        _, full_file_name = os.path.split(input_wav)
+        ref_audio_dir = os.path.realpath("tmp_dir/tdnn")
+        if os.path.exists(ref_audio_dir):
+            shutil.rmtree(ref_audio_dir)
+        else:
+            os.makedirs(ref_audio_dir, exist_ok=True)
+            shutil.copy(input_wav, ref_audio_dir)
+
+        output_dir = os.path.dirname(out_wav)
+
+        cmd = f"""
+            python3 {self.BIN_DIR}/voice_cloning.py \
+                    --am={self.am} \
+                    --am_config={self.am_config} \
+                    --am_ckpt={self.am_ckpt} \
+                    --am_stat={self.am_stat} \
+                    --voc={self.voc} \
+                    --voc_config={self.voc_config} \
+                    --voc_ckpt={self.voc_ckpt} \
+                    --voc_stat={self.voc_stat} \
+                    --text="{text}" \
+                    --input-dir={ref_audio_dir} \
+                    --output-dir={output_dir} \
+                    --phones-dict={self.phones_dict} \
+                    --use_ecapa=True
+        """
+
+        output_name = os.path.join(output_dir, full_file_name)
+        return run_cmd(cmd, output_name=output_name)
diff --git a/demos/speech_web/speech_server/src/util.py b/demos/speech_web/speech_server/src/util.py
index 4a566b6e..a69e6c42 100644
--- a/demos/speech_web/speech_server/src/util.py
+++ b/demos/speech_web/speech_server/src/util.py
@@ -1,4 +1,9 @@
+import os
 import random
+import subprocess
+
+NOW_FILE_PATH = os.path.dirname(__file__)
+MAIN_ROOT = os.path.realpath(os.path.join(NOW_FILE_PATH, "../../../../"))
 
 
 def randName(n=5):
@@ -11,3 +16,20 @@ def SuccessRequest(result=None, message="ok"):
 
 def ErrorRequest(result=None, message="error"):
     return {"code": -1, "result": result, "message": message}
+
+
+def run_cmd(cmd, output_name):
+    p = subprocess.Popen(cmd, shell=True)
+    res = p.wait()
+    print(cmd)
+    print("运行结果：", res)
+    if res == 0:
+        # 运行成功
+        if os.path.exists(output_name):
+            return output_name
+        else:
+            # 合成的文件不存在
+            return None
+    else:
+        # 运行失败
+        return None
diff --git a/demos/speech_web/speech_server/vc.py b/demos/speech_web/speech_server/vc.py
new file mode 100644
index 00000000..99e56b40
--- /dev/null
+++ b/demos/speech_web/speech_server/vc.py
@@ -0,0 +1,547 @@
+import argparse
+import base64
+import datetime
+import json
+import os
+from typing import List
+
+import aiofiles
+import librosa
+import soundfile as sf
+import uvicorn
+from fastapi import FastAPI
+from fastapi import UploadFile
+from pydantic import BaseModel
+from src.ernie_sat import SAT
+from src.finetune import FineTune
+from src.ge2e_clone import VoiceCloneGE2E
+from src.tdnn_clone import VoiceCloneTDNN
+from src.util import *
+from starlette.responses import FileResponse
+
+from paddlespeech.server.utils.audio_process import float2pcm
+
+# Parse command-line options.
+# NOTE: this runs at import time, so importing this module parses sys.argv.
+parser = argparse.ArgumentParser(prog='PaddleSpeechDemo', add_help=True)
+
+parser.add_argument(
+    "--port",
+    action="store",
+    type=int,
+    help="port of the app",
+    default=8010,
+    required=False)
+
+args = parser.parse_args()
+port = args.port
+
+# Original author's note: creating these models here affects fine-tuning,
+# which is why fine-tuning is run through a command line instead.
+vc_model = VoiceCloneGE2E()
+vc_model_tdnn = VoiceCloneTDNN()
+
+sat_model = SAT()
+ft_model = FineTune()
+
+# Configuration files
+# NOTE(review): these five names are not referenced anywhere else in this
+# file.
+tts_config = "conf/tts_online_application.yaml"
+asr_config = "conf/ws_conformer_wenetspeech_application_faster.yaml"
+asr_init_path = "source/demo/demo.wav"
+db_path = "source/db/vc.sqlite"
+ie_model_path = "source/model"
+
+# Path configuration (all relative to the current working directory)
+VC_UPLOAD_PATH = "source/wav/vc/upload"
+VC_OUT_PATH = "source/wav/vc/out"
+
+FT_UPLOAD_PATH = "source/wav/finetune/upload"
+FT_OUT_PATH = "source/wav/finetune/out"
+FT_LABEL_PATH = "source/wav/finetune/label.json"
+FT_LABEL_TXT_PATH = "source/wav/finetune/labels.txt"
+FT_DEFAULT_PATH = "source/wav/finetune/default"
+FT_EXP_BASE_PATH = "tmp_dir/finetune"
+
+SAT_UPLOAD_PATH = "source/wav/SAT/upload"
+SAT_OUT_PATH = "source/wav/SAT/out"
+SAT_LABEL_PATH = "source/wav/SAT/label.json"
+
+# Load the SAT labels (file name -> text) if a label file already exists;
+# otherwise start with an empty mapping.
+if os.path.exists(SAT_LABEL_PATH):
+    with open(SAT_LABEL_PATH, "r", encoding='utf8') as f:
+        sat_label_dic = json.load(f)
+else:
+    sat_label_dic = {}
+
+# Load the fine-tune labels if a label file already exists; otherwise start
+# with an empty mapping.
+if os.path.exists(FT_LABEL_PATH):
+    with open(FT_LABEL_PATH, "r", encoding='utf8') as f:
+        ft_label_dic = json.load(f)
+else:
+    ft_label_dic = {}
+
+# Create the working directories that do not exist yet.
+base_sources = [
+    VC_UPLOAD_PATH,
+    VC_OUT_PATH,
+    FT_UPLOAD_PATH,
+    FT_OUT_PATH,
+    FT_DEFAULT_PATH,
+    SAT_UPLOAD_PATH,
+    SAT_OUT_PATH,
+]
+for path in base_sources:
+    os.makedirs(path, exist_ok=True)
+#####################################################################
+########################### APP initialization ######################
+#####################################################################
+app = FastAPI()
+
+######################################################################
+########################### 接口类型  #################################
+#####################################################################
+
+
+# Request schemas
+# VcBase identifies one audio file on the server. The download, file and
+# delete handlers in this module read only `wavPath`.
+class VcBase(BaseModel):
+    wavName: str  # file name of the audio
+    wavPath: str  # path of the audio file on the server
+
+
+# Request body of /vc/clone_g2p.
+class VcBaseText(BaseModel):
+    wavName: str  # name given to the synthesized file under VC_OUT_PATH
+    wavPath: str  # path of the reference audio passed to the model
+    text: str  # text to synthesize
+    func: str  # 'ge2e' selects the GE2E model; any other value uses TDNN
+
+
+# Request body of /vc/clone_sat.
+class VcBaseSAT(BaseModel):
+    old_str: str  # text stored as the label of `filename`
+    new_str: str  # new text handed to the SAT model
+    language: str  # "zh" or "en"
+    function: str  # "synthesize", "edit" or "crossclone"
+    # Original note: base64 encoded.
+    # NOTE(review): VcCloneSAT passes this value to os.path.realpath as a
+    # file path rather than decoding it; confirm which one is intended.
+    wav: str
+    filename: str  # key into the SAT label dict and suffix of the output name
+
+
+# Request body of /finetune/list.
+class FTPath(BaseModel):
+    dataPath: str  # "default" selects FT_DEFAULT_PATH; otherwise a directory path
+
+
+# Request body of /finetune/upload.
+class VcBaseFT(BaseModel):
+    wav: str  # base64-encoded file content
+    filename: str  # name of the file written inside `wav_path`
+    wav_path: str  # target directory; created by the handler when missing
+
+
+# Request body of /finetune/clone_finetune.
+class VcBaseFTModel(BaseModel):
+    wav_path: str  # 'default' selects FT_DEFAULT_PATH; otherwise the data directory
+
+
+# Request body of /finetune/clone_finetune_syn.
+class VcBaseFTSyn(BaseModel):
+    exp_path: str  # experiment directory; must exist on the server
+    text: str  # text to synthesize
+
+
+######################################################################
+########################### 文件列表查询与保存服务 #################################
+#####################################################################
+
+
+def getVCList(path):
+    """List every file found under ``path``, recursively.
+
+    Args:
+        path: Directory to walk.
+
+    Returns:
+        A list of ``{'name': file name, 'path': joined path}`` dicts sorted
+        by file name in descending order.
+    """
+    entries = [{
+        'name': fname,
+        'path': os.path.join(dirpath, fname)
+    } for dirpath, _, fnames in os.walk(path, topdown=False)
+               for fname in fnames]
+    entries.sort(key=lambda item: item['name'], reverse=True)
+    return entries
+
+
+async def saveFiles(files, SavePath):
+    """Save uploaded audio files into ``SavePath`` and resample them to 16 kHz.
+
+    Args:
+        files: Uploaded file objects; each must provide ``filename`` and an
+            awaitable ``read()``.
+        SavePath: Directory the files are written into.
+
+    Returns:
+        A summary string holding the number of saved files, the number of
+        failed files and the collected failure messages. Failures never
+        raise; they are only counted and reported.
+    """
+    right = 0
+    error = 0
+    error_info = "错误文件："
+    for file in files:
+        try:
+            if 'blob' in file.filename:
+                # Names containing 'blob' (presumably in-browser recordings,
+                # confirm against the web client) get a generated name:
+                # current hour/minute plus a random suffix.
+                out_file_name = datetime.datetime.now().strftime(
+                    '%H%M') + randName(3) + ".wav"
+            else:
+                # Keep only the last path component so a client-supplied
+                # name such as "../../x.wav" cannot escape SavePath.
+                out_file_name = os.path.basename(file.filename)
+            out_file_path = os.path.join(SavePath, out_file_name)
+
+            print("上传文件名:", out_file_path)
+            async with aiofiles.open(out_file_path, 'wb') as out_file:
+                content = await file.read()  # async read
+                await out_file.write(content)  # async write
+            # Rewrite the stored file in place, resampled to 16 kHz.
+            wav, sr = librosa.load(out_file_path, sr=16000)
+            sf.write(out_file_path, data=wav, samplerate=sr)
+            right += 1
+        except Exception as e:
+            error += 1
+            error_info = error_info + file.filename + " " + str(e) + "\n"
+            continue
+    return f"上传成功：{right}, 上传失败：{error}, 失败原因： {error_info}"
+
+
+# Download an audio file
+# NOTE(review): wavPath comes straight from the request body and is served
+# as-is; consider restricting it to the demo's data directories.
+@app.post("/vc/download")
+async def VcDownload(base: VcBase):
+    # Guard clause: report a missing file instead of raising.
+    if not os.path.exists(base.wavPath):
+        return ErrorRequest(message="下载请求失败，文件不存在")
+    return FileResponse(base.wavPath)
+
+
+# Download an audio file as base64 text
+@app.post("/vc/download_base64")
+async def VcDownloadBase64(base: VcBase):
+    if not os.path.exists(base.wavPath):
+        return ErrorRequest(message="播放请求失败，文件不存在")
+    # Load the audio resampled to 16 kHz, convert the float samples with
+    # float2pcm (float32 -> int16 per the original note) and base64-encode
+    # the raw sample bytes.
+    samples, _ = librosa.load(base.wavPath, sr=16000)
+    pcm_bytes = float2pcm(samples).tobytes()
+    encoded = base64.b64encode(pcm_bytes).decode('utf8')
+    return SuccessRequest(result=encoded)
+
+
+######################################################################
+########################### VC 服务 #################################
+#####################################################################
+
+
+# Upload reference audio for voice cloning
+@app.post("/vc/upload")
+async def VcUpload(files: List[UploadFile]):
+    # Delegate to the shared helper instead of duplicating its body here:
+    # it stores every file under VC_UPLOAD_PATH, resamples it to 16 kHz and
+    # returns the success/failure summary string.
+    summary = await saveFiles(files, VC_UPLOAD_PATH)
+    return SuccessRequest(result=summary)
+
+
+# List the uploaded voice-clone files
+@app.get("/vc/list")
+async def VcList():
+    # Every file found under VC_UPLOAD_PATH, as name/path dicts.
+    return SuccessRequest(result=getVCList(VC_UPLOAD_PATH))
+
+
+# Fetch an audio file
+@app.post("/vc/file")
+async def VcFileGet(base: VcBase):
+    # Guard clause: a missing file is reported in the `result` field.
+    if not os.path.exists(base.wavPath):
+        return ErrorRequest(result="获取文件失败")
+    return FileResponse(base.wavPath)
+
+
+# Delete an audio file
+# NOTE(review): wavPath comes straight from the request body, so any path
+# the server process can remove is accepted; consider restricting it.
+@app.post("/vc/del")
+async def VcFileDel(base: VcBase):
+    target = base.wavPath
+    if not os.path.exists(target):
+        return ErrorRequest(result="删除失败")
+    os.remove(target)
+    return SuccessRequest(result="删除成功")
+
+
+# Voice cloning with the GE2E or the TDNN model
+@app.post("/vc/clone_g2p")
+async def VcCloneG2P(base: VcBaseText):
+    if not os.path.exists(base.wavPath):
+        return ErrorRequest(message="克隆失败，音频不存在")
+    try:
+        # 'ge2e' selects the GE2E model; every other value uses TDNN.
+        cloner = vc_model if base.func == 'ge2e' else vc_model_tdnn
+        wavName = base.wavName
+        wavPath = os.path.join(VC_OUT_PATH, wavName)
+        # NOTE(review): the return value of vc() is not inspected, so only
+        # a raised exception is reported as a failure.
+        cloner.vc(text=base.text, input_wav=base.wavPath, out_wav=wavPath)
+        return SuccessRequest(result={"wavName": wavName, "wavPath": wavPath})
+    except Exception as e:
+        print(e)
+        return ErrorRequest(message="克隆失败，合成过程报错")
+
+
+######################################################################
+########################### SAT 服务 #################################
+#####################################################################
+# SAT voice cloning: synthesize, edit or cross-language clone
+@app.post("/vc/clone_sat")
+async def VcCloneSAT(base: VcBaseSAT):
+    # Store the label for this file when it is new or has changed, and
+    # write the whole label dict back to disk.
+    label_is_current = (base.filename in sat_label_dic and
+                        sat_label_dic[base.filename] == base.old_str)
+    if not label_is_current:
+        sat_label_dic[base.filename] = base.old_str
+        with open(SAT_LABEL_PATH, "w", encoding='utf8') as f:
+            json.dump(sat_label_dic, f, ensure_ascii=False, indent=4)
+
+    input_file_path = base.wav
+
+    # Output file name prefix per task; the same "_zh_" prefixes are used
+    # for both languages.
+    out_prefixes = {
+        "synthesize": "sat_syn_zh_",
+        "edit": "sat_edit_zh_",
+        "crossclone": "sat_cross_zh_",
+    }
+
+    # The language is validated before the task.
+    if base.language not in ("zh", "en"):
+        return ErrorRequest(message="请检查功能选项是否正确，仅支持中文和英文")
+    if base.function not in out_prefixes:
+        return ErrorRequest(
+            message="请检查功能选项是否正确，仅支持:synthesize, edit, crossclone")
+
+    output_file_path = os.path.join(
+        SAT_OUT_PATH, out_prefixes[base.function] + base.filename)
+    common_kwargs = dict(
+        old_str=base.old_str,
+        new_str=base.new_str,
+        input_name=os.path.realpath(input_file_path),
+        output_name=os.path.realpath(output_file_path))
+
+    if base.function == "crossclone":
+        # Cross-language: the target is the other one of the two languages.
+        target_lang = "en" if base.language == "zh" else "zh"
+        sat_result = sat_model.crossclone(
+            source_lang=base.language, target_lang=target_lang, **common_kwargs)
+    else:
+        # Same-language synthesize/edit goes through the per-language entry
+        # point, with the task name passed along.
+        if base.language == "zh":
+            synthesize_edit = sat_model.zh_synthesize_edit
+        else:
+            synthesize_edit = sat_model.en_synthesize_edit
+        sat_result = synthesize_edit(task_name=base.function, **common_kwargs)
+
+    if not sat_result:
+        return ErrorRequest(message="SAT 合成失败，请从后台检查错误信息！")
+    return SuccessRequest(result=sat_result, message="SAT合成成功")
+
+
+# List the SAT files together with their labels
+@app.get("/sat/list")
+async def SatList():
+    res = []
+    for fileitem in getVCList(SAT_UPLOAD_PATH):
+        # Files without a stored label get an empty string.
+        fileitem['label'] = sat_label_dic.get(fileitem['name'], "")
+        res.append(fileitem)
+    return SuccessRequest(result=res)
+
+
+# Upload audio for SAT
+@app.post("/sat/upload")
+async def SATUpload(files: List[UploadFile]):
+    # Delegate to the shared helper instead of duplicating its body here:
+    # it stores every file under SAT_UPLOAD_PATH, resamples it to 16 kHz and
+    # returns the success/failure summary string.
+    summary = await saveFiles(files, SAT_UPLOAD_PATH)
+    return SuccessRequest(result=summary)
+
+
+######################################################################
+########################### FineTune service #################################
+#####################################################################
+
+
+# List the fine-tune sentences and their recordings
+@app.post("/finetune/list")
+async def FineTuneList(Path: FTPath):
+    dataPath = Path.dataPath
+    if dataPath == "default":
+        # Default data directory
+        FT_PATH = FT_DEFAULT_PATH
+    else:
+        FT_PATH = dataPath
+
+    # One entry per labelled sentence; 'path' is empty when the recording
+    # does not exist in FT_PATH. The list is driven by the label dict, so
+    # the directory does not need to be walked.
+    res = []
+    for name, value in ft_label_dic.items():
+        wav_path = os.path.join(FT_PATH, name)
+        if not os.path.exists(wav_path):
+            wav_path = ""
+        res.append({'text': value['text'], 'name': name, 'path': wav_path})
+    return SuccessRequest(result=res)
+
+
+# Reset: create a fresh upload directory and return its path
+@app.get('/finetune/newdir')
+async def FTGetNewDir():
+    import shutil  # local import: only this handler needs it
+
+    new_path = os.path.join(FT_UPLOAD_PATH, randName(3))
+    os.makedirs(new_path, exist_ok=True)
+    # Copy labels.txt into the new directory. This replaces a shell `cp`
+    # call; the copy stays best-effort, but a failure is now printed
+    # instead of being ignored silently.
+    try:
+        shutil.copy(FT_LABEL_TXT_PATH, new_path)
+    except OSError as e:
+        print(e)
+    return SuccessRequest(result=new_path)
+
+
+# Upload one base64-encoded recording for fine-tuning
+@app.post("/finetune/upload")
+async def FTUpload(base: VcBaseFT):
+    try:
+        # Create the target directory when missing; exist_ok avoids a
+        # failure if it appears between two requests.
+        os.makedirs(base.wav_path, exist_ok=True)
+        # Keep only the last path component of the client-supplied name so
+        # it cannot point outside wav_path.
+        out_file_path = os.path.join(base.wav_path,
+                                     os.path.basename(base.filename))
+        wav_b = base64.b64decode(base.wav)
+        async with aiofiles.open(out_file_path, 'wb') as out_file:
+            await out_file.write(wav_b)  # async write
+
+        return SuccessRequest(result="上传成功")
+    except Exception as e:
+        # Log the cause like the other handlers do instead of dropping it.
+        print(e)
+        return ErrorRequest(result="上传失败")
+
+
+# Fine-tune a model on the given data directory
+@app.post("/finetune/clone_finetune")
+async def FTModel(base: VcBaseFTModel):
+    # 'default' selects FT_DEFAULT_PATH; any other value is used as the
+    # data directory path as-is.
+    if base.wav_path == 'default':
+        data_path = FT_DEFAULT_PATH
+    else:
+        data_path = base.wav_path
+    if not os.path.exists(data_path):
+        return ErrorRequest(message="数据文件夹不存在")
+
+    # normpath drops a trailing separator first, so "a/b/" still yields "b"
+    # instead of an empty name that would collapse exp_dir onto
+    # FT_EXP_BASE_PATH itself.
+    data_base = os.path.basename(os.path.normpath(data_path))
+    exp_dir = os.path.join(FT_EXP_BASE_PATH, data_base)
+    try:
+        exp_dir = ft_model.finetune(
+            input_dir=os.path.realpath(data_path),
+            exp_dir=os.path.realpath(exp_dir))
+        # A falsy result from finetune() is reported as a failure.
+        if exp_dir:
+            return SuccessRequest(result=exp_dir)
+        else:
+            return ErrorRequest(message="微调失败")
+    except Exception as e:
+        print(e)
+        return ErrorRequest(message="微调失败")
+
+
+# Synthesize speech with a fine-tuned model
+@app.post("/finetune/clone_finetune_syn")
+async def FTSyn(base: VcBaseFTSyn):
+    try:
+        if not os.path.exists(base.exp_path):
+            return ErrorRequest(result="模型路径不存在")
+        # The output gets a random 5-character base name.
+        wav_name = randName(5)
+        wav_path = ft_model.synthesize(
+            text=base.text,
+            wav_name=wav_name,
+            out_wav_dir=os.path.realpath(FT_OUT_PATH),
+            exp_dir=os.path.realpath(base.exp_path))
+        # A falsy result from synthesize() is reported as a failure.
+        if wav_path:
+            res = {"wavName": wav_name + ".wav", "wavPath": wav_path}
+            return SuccessRequest(result=res)
+        else:
+            return ErrorRequest(message="音频合成失败")
+    except Exception as e:
+        # Log the cause like FTModel does instead of dropping it.
+        print(e)
+        return ErrorRequest(message="音频合成失败")
+
+
+# Script entry point: serve the app on all interfaces at the port given by
+# --port.
+if __name__ == '__main__':
+    uvicorn.run(app=app, host='0.0.0.0', port=port)
diff --git a/demos/speech_web/web_client/package.json b/demos/speech_web/web_client/package.json
index 7f28d4c9..d8c213e4 100644
--- a/demos/speech_web/web_client/package.json
+++ b/demos/speech_web/web_client/package.json
@@ -8,6 +8,7 @@
     "preview": "vite preview"
   },
   "dependencies": {
+    "@element-plus/icons-vue": "^2.0.9",
     "ant-design-vue": "^2.2.8",
     "axios": "^0.26.1",
     "element-plus": "^2.1.9",
@@ -18,6 +19,7 @@
   },
   "devDependencies": {
     "@vitejs/plugin-vue": "^2.3.0",
-    "vite": "^2.9.0"
+    "vite": "^2.9.13",
+    "@vue/compiler-sfc": "^3.1.0"
   }
 }
diff --git a/demos/speech_web/web_client/src/api/API.js b/demos/speech_web/web_client/src/api/API.js
index 0feaa63f..5adca362 100644
--- a/demos/speech_web/web_client/src/api/API.js
+++ b/demos/speech_web/web_client/src/api/API.js
@@ -19,6 +19,26 @@ export const apiURL =   {
     CHAT_SOCKET_RECORD: 'ws://localhost:8010/ws/asr/offlineStream', // ChatBot websocket 接口
     ASR_SOCKET_RECORD: 'ws://localhost:8010/ws/asr/onlineStream',  // Stream ASR 接口
     TTS_SOCKET_RECORD: 'ws://localhost:8010/ws/tts/online', // Stream TTS 接口
+
+    // Voice clone, SAT and fine-tune endpoints
+    // (paths mirror the routes declared in speech_server/vc.py)
+    VC_List: '/api/vc/list',
+    SAT_List: '/api/sat/list',
+    FineTune_List: '/api/finetune/list',
+
+    VC_Upload: '/api/vc/upload',
+    SAT_Upload: '/api/sat/upload',
+    FineTune_Upload: '/api/finetune/upload',
+    FineTune_NewDir: '/api/finetune/newdir',
+
+    VC_Download: '/api/vc/download',
+    VC_Download_Base64: '/api/vc/download_base64',
+    VC_Del: '/api/vc/del',
+    
+    VC_CloneG2p: '/api/vc/clone_g2p',
+    VC_CloneSAT: '/api/vc/clone_sat',
+    VC_CloneFineTune: '/api/finetune/clone_finetune',
+    VC_CloneFineTuneSyn: '/api/finetune/clone_finetune_syn',
 }
 
 
diff --git a/demos/speech_web/web_client/src/api/ApiVC.js b/demos/speech_web/web_client/src/api/ApiVC.js
new file mode 100644
index 00000000..0dc0f683
--- /dev/null
+++ b/demos/speech_web/web_client/src/api/ApiVC.js
@@ -0,0 +1,88 @@
+import axios from 'axios'
+import {apiURL} from "./API.js"
+
+// Upload audio for voice cloning
+export async function vcUpload(params){
+    return axios.post(apiURL.VC_Upload, params);
+}
+
+// Upload audio for SAT
+export async function satUpload(params){
+    return axios.post(apiURL.SAT_Upload, params);
+}
+
+// Upload audio for fine-tuning
+export async function fineTuneUpload(params){
+    return axios.post(apiURL.FineTune_Upload, params);
+}
+
+// Delete an audio file
+export async function vcDel(params){
+    return axios.post(apiURL.VC_Del, params);
+}
+
+// Fetch the voice-clone audio list
+export async function vcList(){
+    return axios.get(apiURL.VC_List);
+}
+// Fetch the SAT audio list
+export async function satList(){
+    return axios.get(apiURL.SAT_List);
+}
+
+// Fetch the fine-tune audio list
+export async function fineTuneList(params){
+    return axios.post(apiURL.FineTune_List, params);
+}
+
+// Fine-tune reset: request a new upload directory
+export async function fineTuneNewDir(){
+    return axios.get(apiURL.FineTune_NewDir);
+}
+
+// Fetch audio data
+export async function vcDownload(params){
+    return axios.post(apiURL.VC_Download, params);
+}
+
+// Fetch audio data as base64
+export async function vcDownloadBase64(params){
+    return axios.post(apiURL.VC_Download_Base64, params);
+}
+
+
+// Clone synthesis through the clone_g2p endpoint
+export async function vcCloneG2P(params){
+    return axios.post(apiURL.VC_CloneG2p, params);
+}
+
+// Clone synthesis through the clone_sat endpoint
+export async function vcCloneSAT(params){
+    return axios.post(apiURL.VC_CloneSAT, params);
+}
+
+// Clone synthesis: run fine-tuning
+export async function vcCloneFineTune(params){
+    return axios.post(apiURL.VC_CloneFineTune, params);
+}
+
+// Clone synthesis: synthesize with the fine-tuned model
+export async function vcCloneFineTuneSyn(params){
+    return axios.post(apiURL.VC_CloneFineTuneSyn, params);
+}
+
+
diff --git a/demos/speech_web/web_client/src/components/Content/Header/Header.vue b/demos/speech_web/web_client/src/components/Content/Header/Header.vue
index 8135a2bf..c20f3366 100644
--- a/demos/speech_web/web_client/src/components/Content/Header/Header.vue
+++ b/demos/speech_web/web_client/src/components/Content/Header/Header.vue
@@ -4,7 +4,7 @@
         飞桨-PaddleSpeech
       </div>
       <div className="speech_header_describe">
-        PaddleSpeech 是基于飞桨 PaddlePaddle 的语音方向的开源模型库，用于语音和音频中的各种关键任务的开发，欢迎大家Star收藏鼓励
+        PaddleSpeech 是基于飞桨 PaddlePaddle 的语音方向的开源模型库，用于语音和音频中的各种关键任务的开发。支持语音识别，语音合成，声纹识别，声音分类，语音唤醒，语音翻译等多种语音任务，荣获 NAACL2022 Best Demo Award 。如果你喜欢这个示例，欢迎在 github 中 star 收藏鼓励。
       </div>
       <div className="speech_header_link_box">
         <a href="https://github.com/PaddlePaddle/PaddleSpeech" className="speech_header_link"  target='_blank' rel='noreferrer' key={index}>
diff --git a/demos/speech_web/web_client/src/components/Content/Header/style.less b/demos/speech_web/web_client/src/components/Content/Header/style.less
index 9d026137..cc97c741 100644
--- a/demos/speech_web/web_client/src/components/Content/Header/style.less
+++ b/demos/speech_web/web_client/src/components/Content/Header/style.less
@@ -43,6 +43,7 @@
         margin-bottom: 40px;
         display: flex;
         align-items: center;
+        margin-top: 40px;
     };
     .speech_header_link {
         display: block;
diff --git a/demos/speech_web/web_client/src/components/Experience.vue b/demos/speech_web/web_client/src/components/Experience.vue
index 5620d6af..4f32faf9 100644
--- a/demos/speech_web/web_client/src/components/Experience.vue
+++ b/demos/speech_web/web_client/src/components/Experience.vue
@@ -6,6 +6,10 @@ import TTST from './SubMenu/TTS/TTST.vue'
 import VPRT from './SubMenu/VPR/VPRT.vue'
 import IET from './SubMenu/IE/IET.vue'
 
+import VoiceCloneT from './SubMenu/VoiceClone/VoiceClone.vue'
+import ENIRE_SATT from './SubMenu/ENIRE_SAT/ENIRE_SAT.vue'
+import FineTuneT from './SubMenu/FineTune/FineTune.vue'
+
 </script>
 
 <template>
@@ -37,6 +41,15 @@ import IET from './SubMenu/IE/IET.vue'
             <el-tab-pane label="语音指令" key="5">
             <IET></IET>
             </el-tab-pane>
+            <el-tab-pane label="一句话合成" key="6">
+            <VoiceCloneT></VoiceCloneT>
+            </el-tab-pane>
+            <el-tab-pane label="小数据微调" key="7">
+            <FineTuneT></FineTuneT>
+            </el-tab-pane>
+            <el-tab-pane label="ENIRE SAT" key="8">
+            <ENIRE_SATT></ENIRE_SATT>
+            </el-tab-pane>
           </el-tabs>
         </div>
       </div>
diff --git a/demos/speech_web/web_client/src/components/SubMenu/ENIRE_SAT/ENIRE_SAT.vue b/demos/speech_web/web_client/src/components/SubMenu/ENIRE_SAT/ENIRE_SAT.vue
new file mode 100644
index 00000000..e1a4f234
--- /dev/null
+++ b/demos/speech_web/web_client/src/components/SubMenu/ENIRE_SAT/ENIRE_SAT.vue
@@ -0,0 +1,487 @@
+<template>
+    <div class="sat">
+      <el-row :gutter="20">
+            <el-col :span="12"><div class="grid-content ep-bg-purple" />
+                <el-row :gutter="60" class="btn_row_wav" justify="center">
+                    <el-button class="ml-3" v-if="onEnrollRec === 0" @click="startRecorderEnroll()" type="primary">录制音频</el-button>
+                    <el-button class="ml-3" v-else-if="onEnrollRec === 1" @click="stopRecorderEnroll()" type="danger">停止录音</el-button>
+                    <el-button class="ml-3" v-else @click="uploadRecord()" type="success">上传录音</el-button>
+                    <a>&#12288</a>
+                    <el-upload
+                        :multiple="false"
+                        :accept="'.wav'"
+                        :auto-upload="false"
+                        :on-change="handleChange"
+                        :show-file-list="false"
+                    >
+                        <el-button class="ml-3" type="success">上传音频文件</el-button>
+                    </el-upload>
+                </el-row>
+                <div class="recording_table">
+                <el-table :data="vcDatas" border class="recording_table_box" scrollbar-always-on max-height="250px">
+                    <!-- <el-table-column prop="wavId" label="序号" width="60"/> -->
+                    <el-table-column prop="wavName" label="文件名" width="150"/>
+                    <el-table-column label="文本">
+                      <template #default="scope">
+                            <el-input 
+                              v-model="scope.row.label"
+                              :autosize="{ minRows: 8, maxRows: 13 }" 
+                              placeholder="Please input"
+                              />
+                            
+                        </template>
+                    </el-table-column>
+                    <el-table-column label="操作" width="80">
+                        <template #default="scope">
+                            <div class="flex justify-space-between mb-4 flex-wrap gap-4">
+                                <a @click="PlayTable(scope.row.wavId)"><el-icon><VideoPlay /></el-icon></a>
+                                <a>&#12288</a>
+                                <a @click="delWav(scope.row.wavId)"><el-icon><DeleteFilled /></el-icon></a>
+                            </div>
+                        </template>
+                    </el-table-column>
+                    <el-table-column fixed="right" label="选择" width="70">
+                        <template #default="scope">
+                            <el-switch v-model="scope.row.status"  @click="choseWav(scope.row.wavId)"/>
+                        </template>
+                    </el-table-column>
+                </el-table>
+                </div>
+
+            </el-col>
+            <el-col :span="8"><div class="grid-content ep-bg-purple" />
+                <el-space direction="vertical">
+                    <el-card class="box-card" style="width: 250px; height:310px">
+                        <template #header>
+                            <div class="card-header">
+                            <span>功能选择</span>
+                            </div>
+                        </template>  
+                        <el-radio-group v-model="funcMode">
+                          <el-radio label="1" size="middle" border style="margin-bottom: 10px">个性化语音合成</el-radio>
+                            <el-input
+                              v-if="funcMode === '1'"
+                              v-model="ttsText"
+                              :autosize="{ minRows: 2, maxRows: 2 }"
+                              type="textarea"
+                              placeholder="Please input"
+                              style="margin-bottom: 10px"
+                              />
+                          <el-radio label="2" size="middle" border style="margin-bottom: 10px">跨语言语音合成</el-radio>
+                            <el-input
+                              v-if="funcMode === '2'"
+                              v-model="ttsText"
+                              :autosize="{ minRows: 2, maxRows: 2 }"
+                              type="textarea"
+                              placeholder="Please input"
+                              style="margin-bottom: 10px"
+                              />
+                          <el-radio label="3" size="middle" border style="margin-bottom: 10px">语音编辑</el-radio>
+                            <el-input
+                                v-if="funcMode === '3'"
+                                v-model="ttsText"
+                                :autosize="{ minRows: 2, maxRows: 2 }"
+                                type="textarea"
+                                placeholder="Please input"
+                                style="margin-bottom: 10px"
+                                />
+                        </el-radio-group>
+                    </el-card>                    
+                </el-space>
+            </el-col>
+            <el-col :span="4"><div class="grid-content ep-bg-purple" />
+                <div class="play_board">
+                    <el-space direction="vertical">
+                        <el-row :gutter="20">
+                            <!-- While a synthesis request is running, show a non-clickable loading button -->
+                            <el-button size="large" v-if="onSyn === 0" type="primary" @click="SatSyn()">开始合成</el-button>
+                            <el-button size="large" v-else loading type="danger">合成中</el-button>
+                        </el-row>
+                        <el-row :gutter="20">
+                            <el-button v-if='this.cloneWav' type="success" @click="PlaySyn()">播放</el-button>
+                            <el-button v-else disabled type="success" @click="PlaySyn()">播放</el-button>
+                            <el-button v-if='this.cloneWav' type="primary" @click="downLoadCloneWav()">下载</el-button>
+                            <el-button v-else disabled type="primary" @click="downLoadCloneWav()">下载</el-button>
+                        </el-row>
+                    </el-space>
+                </div>
+            </el-col>
+        </el-row>
+</div>
+</template>
+
+<script>
+import { vcCloneSAT, vcDownload, vcDownloadBase64, satUpload, satList, vcDel } from '../../../api/ApiVC'
+import Recorder from 'js-audio-recorder'
+
+// Playback context shared by every playAudioData() call in this component
+let audioCtx = new AudioContext({
+latencyHint: 'interactive',
+sampleRate: 24000,
+});
+
+// Set up the recorder
+const recorder = new Recorder({
+  sampleBits: 16,                 // bits per sample; 8 or 16 are supported, default is 16
+  sampleRate: 16000,              // sample rate; supports 11025, 16000, 22050, 24000, 44100, 48000; defaults to the browser's rate (48000 in the author's Chrome)
+  numChannels: 1,                 // channel count; 1 or 2 are supported, default is 1
+  compiling: true
+})
+
+export default {
+name:"",
+// Reactive state of the component
+data(){
+    return {
+        // Of the fields above the divider, only `filename` is read in this component
+        // (PlaySyn / downLoadCloneWav build "sat_" + filename from it)
+        uploadStatus : 0,
+        recognitionStatus : 0,
+        asrResult : "",
+        indicator : "",
+        
+        filename: "",
+        upfile: "",
+        mode: 1,
+        language: 1,
+        wav_input: "卡尔普陪外孙玩滑梯",
+        new_input: "卡尔普陪外孙打滑梯",
+        received_file:"",
+
+        // Divider: the fields below drive the template and the methods
+        onEnrollRec: 0,      // record button state: 0 idle, 1 recording, anything else = recorded, waiting for upload
+        onSyn:0,             // 0 idle; non-zero while a synthesis request is running
+        vcDatas: [],         // table rows built by GetList()
+        funcMode: '1',       // selected function: '1' synthesize, '2' crossclone, otherwise edit
+        selected_Id: -1,     // index of the selected row in vcDatas, -1 when nothing is selected
+        ttsText: '',         // text to synthesize
+        cloneWav: '',        // result of the last synthesis request; sent as wavPath when playing/downloading
+        wav:''               // WAV blob of the last recording
+    }
+},
+
+// Load the audio list as soon as the component is mounted
+mounted () {
+        this.GetList()
+    },
+
+methods:{
+    // 获取文件列表
+    async GetList(){
+            this.vcDatas =[]
+            const result = await satList();
+            console.log("List: ", result);
+            for(let i=0; i < result.data.result.length; i++){
+                this.vcDatas.push({
+                    wavName: result.data.result[i]['name'],
+                    wavId: i,
+                    wavPath: result.data.result[i]['path'],
+                    status: false,
+                    label: result.data.result[i]['label']
+                })
+            }
+            console.log("vcDatas: ", this.vcDatas);
+            this.$nextTick(()=>{})
+    },
+
+    // 上传文件切换
+    async handleChange(file, fileList){
+      for(let i=0; i<fileList.length; i++){
+        this.uploadFile(fileList[i])
+      }
+      this.GetList()
+    },
+
+    async uploadFile(file){
+      let formData = new FormData();
+      formData.append('files', file.raw);
+      const result = await satUpload(formData);
+      if (result.data.code === 0) {
+          this.$message.success("音频上传成功")
+          
+      } else {
+          this.$message.error("音频上传失败")
+      }
+    },
+
+    // 开始录音
+    startRecorderEnroll(){
+            this.onEnrollRec = 1
+            recorder.clear()
+            recorder.start()
+        },
+    
+    // Stop recording and keep the result as a WAV blob for uploadRecord()
+    stopRecorderEnroll(){
+        this.onEnrollRec = 2
+        recorder.stop()
+        this.wav = recorder.getWAVBlob()
+    },
+
+    // 上传录音
+    async uploadRecord(){
+            this.onEnrollRec = 0
+            if(this.wav === ""){
+                this.$message.error("未检测到录音，录音失败，请重新录制")
+                return
+            } else {
+                if(this.wav === ''){
+                    this.$message.error("请先完成录音");
+                    this.onEnrollRec = 0
+                    return
+                } else {
+                    let formData = new FormData();
+                    formData.append('files', this.wav);
+                    const result = await satUpload(formData);
+                    console.log(result)
+                    this.GetList() 
+                }
+                this.$message.success("录音上传成功")
+            }
+        }, 
+
+    // 删除音频文件
+    async delWav(wavId){
+            console.log('wavId', wavId)
+            // 删除文件
+            const result = await vcDel(
+                {
+                  wavName: this.vcDatas[wavId]['wavName'],
+                  wavPath: this.vcDatas[wavId]['wavPath']
+                }
+            );
+            if(!result.data.code){
+                this.$message.success("删除成功")
+            } else {
+                this.$message.error(result.data.msg)
+            }
+            this.GetList()
+            this.reset()
+        },
+    
+    // Play the audio of table row `wavId` (the row object is handed to Play unchanged)
+    async PlayTable(wavId){
+        this.Play(this.vcDatas[wavId])
+    },
+
+    // 播放音频
+    async Play(wavBase){
+        // 获取音频数据
+        const result = await vcDownloadBase64(wavBase);
+        // console.log('play result', result)
+        if (result.data.code === 0) {
+            // base转换二进制数
+            let typedArray = this.base64ToUint8Array(result.data.result)
+            // 添加wav文件头
+            let view = new DataView(typedArray.buffer);
+            view = Recorder.encodeWAV(view, 16000, 16000, 1, 16, true);
+            // 播放音频
+            this.playAudioData(view.buffer);
+        };
+        },
+    // chose wav
+    choseWav(wavId){
+            this.cloneWav = ''
+            this.nowFile = this.vcDatas[wavId].wavName
+            this.nowIndex = wavId
+            // only wavId is true else false
+            for(let i=0; i<this.vcDatas.length; i++){
+                if(i==wavId){
+                    this.vcDatas[wavId].status = true
+                    this.selected_Id = wavId
+                    this.ttsText = this.vcDatas[wavId]['label']
+                } else {
+                    this.vcDatas[i].status = false
+                }
+            }
+            this.$nextTick(()=>{})
+        },
+
+    // 播放音频
+    playAudioData(wav_buffer){
+        audioCtx.decodeAudioData(wav_buffer, buffer => {
+            let source = audioCtx.createBufferSource();
+            source.buffer = buffer
+            source.connect(audioCtx.destination);
+            source.start();
+        }, function (e) {
+        });
+    },
+
+
+    base64ToUint8Array(base64String){
+       const padding = '='.repeat((4 - base64String.length % 4) % 4);
+        const base64 = (base64String + padding)
+            .replace(/-/g, '+')
+            .replace(/_/g, '/');
+    
+        const rawData = window.atob(base64);
+        const outputArray = new Uint8Array(rawData.length);
+    
+        for (let i = 0; i < rawData.length; ++i) {
+            outputArray[i] = rawData.charCodeAt(i);
+        }
+        return outputArray; 
+    },
+
+    // 检查是否包含中文
+    hasChinese(str) {
+      return /[\u4E00-\u9FA5]+/g.test(str)
+    },
+
+    // SAT合成
+    async SatSyn(){
+      // 检查 select id
+      if(this.selected_Id < 0){
+        return this.$message.error("请先选择音频文件！")
+      }
+
+      // 检查音频对应的文本
+      if(!this.vcDatas[this.selected_Id]['label']){
+        return this.$message.error("音频对应文本不可以为空！")
+      }
+
+      // 检查待合成文本
+      if(!this.ttsText){
+        return this.$message.error("合成文本不可以为空！")
+      }
+
+      // 合成中
+      this.onSyn = 1
+      // 重置 clone wav
+      this.cloneWav = ""
+  
+      const old_str = this.vcDatas[this.selected_Id]['label']
+      const new_str = this.ttsText
+      let language = ""
+      // 包含中文
+      if(this.hasChinese(old_str)){
+        language = "zh"
+      } else{
+        language = "en"
+      }
+      // 功能选择
+      let func = ""
+      if(this.funcMode === '1') {
+        func = "synthesize"
+      } else if(this.funcMode === '2'){
+        func = "crossclone"
+      } else {
+        func = "edit"
+      }
+      
+      let wav_path = this.vcDatas[this.selected_Id]['wavPath']
+      let filename = this.vcDatas[this.selected_Id]['wavName']
+
+      const data = {
+        old_str: old_str,
+        new_str: new_str,
+        language: language,
+        function: func,
+        wav: wav_path,
+        filename: filename
+
+      }
+
+      console.log("sat data: ", data)
+      
+      // sat 接口
+      const result = await vcCloneSAT(data)
+      // 合成完成
+      this.onSyn = 0
+      console.log(result);
+      // debugger
+      if (result.data.code === 0) {
+
+        this.$message.success(result.data.message)
+        // 获取识别文本
+        this.cloneWav = result.data.result
+        console.log("cloneWave", this.cloneWav);
+
+      } else {
+        this.$message.error(result.data.message)
+      };
+    },
+    // 播放合成的音频
+    // 播放音频
+    async PlaySyn(){
+        // 获取音频数据
+        const data = {
+          wavName: "sat_"+this.filename,
+          wavPath: this.cloneWav
+        }
+        const result = await vcDownloadBase64(data);
+        // console.log('play result', result)
+        if (result.data.code === 0) {
+            // base转换二进制数
+            let typedArray = this.base64ToUint8Array(result.data.result)
+            // 添加wav文件头
+            let view = new DataView(typedArray.buffer);
+            view = Recorder.encodeWAV(view, 16000, 16000, 1, 16, true);
+            // 播放音频
+            this.playAudioData(view.buffer);
+        };
+        },
+
+
+    // 下载合成文件
+    async downLoadCloneWav(){
+    if(this.cloneWav  === ""){
+        this.$message.error("音频合成完毕后再下载！")
+    } else {
+        // const result = await vcDownload(this.cloneWav);
+        // 获取音频数据
+        const data = {
+          wavName: "sat_"+this.filename,
+          wavPath: this.cloneWav
+        }
+        const result = await vcDownloadBase64(data);
+        let view;
+        // console.log('play result', result)
+        if (result.data.code === 0) {
+            // base转换二进制数
+            let typedArray = this.base64ToUint8Array(result.data.result)
+            // 添加wav文件头
+            view = new DataView(typedArray.buffer);
+            view = Recorder.encodeWAV(view, 16000, 16000, 1, 16, true);
+            // 播放音频
+            // this.playAudioData(view.buffer);
+        }
+        console.log(view.buffer)
+        // debugger
+        const blob = new Blob([view.buffer], { type: 'audio/wav' });
+        const fileName = new Date().getTime() + '.wav';
+        const down = document.createElement('a');
+        down.download = fileName;
+        down.style.display = 'none';//隐藏,没必要展示出来
+        down.href = URL.createObjectURL(blob);
+        document.body.appendChild(down);
+        down.click();
+        URL.revokeObjectURL(down.href); // 释放URL 对象
+        document.body.removeChild(down);//下载完成移除
+      }
+    },
+
+}
+}   
+
+</script>
+
+<style lang="less" scoped>
+// @import "./style.less";
+.sat {
+    width: 1200px;
+    height: 410px;
+    background: #FFFFFF;
+    padding: 5px 80px 56px 80px;
+    box-sizing: border-box;
+}
+
+.el-row {
+  margin-bottom: 20px;
+}
+.grid-content {
+  border-radius: 4px;
+  min-height: 36px;
+}
+.play_board{
+    height: 100%;
+    display: flex;
+    align-items: center;
+}
+
+</style>
\ No newline at end of file
diff --git a/demos/speech_web/web_client/src/components/SubMenu/FineTune/FineTune.vue b/demos/speech_web/web_client/src/components/SubMenu/FineTune/FineTune.vue
new file mode 100644
index 00000000..895dd586
--- /dev/null
+++ b/demos/speech_web/web_client/src/components/SubMenu/FineTune/FineTune.vue
@@ -0,0 +1,427 @@
+<template>
+    <div class="finetune">
+      <el-row :gutter="20"> 
+        <el-col :span="12"><div class="grid-content ep-bg-purple" />
+          <el-row :gutter="60" class="btn_row_wav" justify="center">
+              <el-button class="ml-3" @click="clearAll()" type="primary">一键重置</el-button>
+              <el-button class="ml-3" @click="resetDefault()" type="primary">默认示例</el-button>
+              <!-- onFinetune: 0 idle, 1 running, 2 succeeded, anything else failed. The running button is a non-clickable spinner so a second fine-tune cannot be started. -->
+              <el-button v-if='onFinetune === 0' class="ml-3" @click="fineTuneModel()" type="primary">一键微调</el-button>
+              <el-button v-else-if='onFinetune === 1' class="ml-3" loading type="danger">微调中</el-button>
+              <el-button v-else-if='onFinetune === 2' class="ml-3" @click="resetFinetuneBtn()" type="success">微调成功</el-button>
+              <el-button v-else class="ml-3" @click="resetFinetuneBtn()" type="warning">微调失败</el-button>
+              <!-- <el-button class="ml-3" @click="chooseHistory()" type="warning">历史数据选择</el-button> -->
+        </el-row>
+
+        <div class="recording_table">
+            <el-table :data="vcDatas" border class="recording_table_box" scrollbar-always-on max-height="250px">
+                <el-table-column prop="wavId" label="序号" width="60"/>
+                <el-table-column prop="text" label="文本" />
+                <el-table-column label="音频" width="80">
+                    <template #default="scope">
+                        <a v-if="scope.row.wavPath != ''">{{ scope.row.wavName }}</a>
+                        <a v-else>
+                            
+                            <el-button class="ml-3" v-if="onEnrollRec === 0" @click="startRecorderEnroll()" type="primary" circle>
+                                <el-icon><Microphone /></el-icon>
+                            </el-button>
+                            <el-button class="ml-3" v-else-if="onEnrollRec === 1" @click="stopRecorderEnroll()" type="danger" circle>
+                                <el-icon><Microphone /></el-icon>
+                            </el-button>
+                            <el-button class="ml-3" v-else @click="uploadRecord(scope.row.wavId)" type="success" circle>
+                                <el-icon><Upload /></el-icon>
+                            </el-button>
+                        </a>
+                    </template>
+                </el-table-column>
+                <el-table-column label="操作" width="80" fixed="right">
+                    <template #default="scope">
+                        <div class="flex justify-space-between mb-4 flex-wrap gap-4">
+                            <a @click="PlayTable(scope.row.wavId)"><el-icon><VideoPlay /></el-icon></a>
+                            <a>&#12288</a>
+                            <a @click="delWav(scope.row.wavId)"><el-icon><DeleteFilled /></el-icon></a>
+                        </div>
+                    </template>
+                </el-table-column>
+            </el-table>
+        </div>
+
+            </el-col>
+            <el-col :span="8"><div class="grid-content ep-bg-purple" />
+                <el-space direction="vertical">
+                    <el-card class="box-card" style="width: 250px; height:310px">
+                        <template #header>
+                            
+                            <div class="card-header">
+                                <span>试验路径</span>
+                                <el-input
+                                    v-model="expPath"
+                                    :autosize="{ minRows: 2, maxRows: 3 }"
+                                    type="textarea"
+                                    placeholder="一键微调自动生成，可使用历史试验路径"
+                                    />
+                            </div>
+                        </template>
+                        <span>请输入中文文本</span>
+                        <el-input
+                            v-model="ttsText"
+                            :autosize="{ minRows: 5, maxRows: 6 }"
+                            type="textarea"
+                            placeholder="请输入待合成文本"
+                            />
+                    </el-card>                    
+                </el-space>
+            </el-col>
+            <el-col :span="4"><div class="grid-content ep-bg-purple" />
+                <div class="play_board">
+                    <el-space direction="vertical">
+                        <el-row :gutter="20">
+                            <!-- While a synthesis request is running, show a non-clickable loading button -->
+                            <el-button size="large" v-if="onSyn === 0" type="primary" @click="fineTuneSyn()">开始合成</el-button>
+                            <el-button size="large" v-else loading type="danger">合成中</el-button>
+                        </el-row>
+
+                        <el-row :gutter="20">
+                            <el-button v-if='this.cloneWav' type="primary" @click="PlaySyn()">播放</el-button>
+                            <el-button v-else disabled type="primary" @click="PlaySyn()">播放</el-button>
+                            <el-button v-if='this.cloneWav' type="primary" @click="downLoadCloneWav()">下载</el-button>
+                            <el-button v-else disabled type="primary" @click="downLoadCloneWav()">下载</el-button>
+                        </el-row>
+                    </el-space>
+                </div>
+            </el-col>
+        </el-row>
+    </div>
+    </template>
+    
+    <script>
+    import Recorder from 'js-audio-recorder'
+    import { vcDownload, vcDownloadBase64, vcCloneFineTune, vcCloneFineTuneSyn, fineTuneList, vcDel, fineTuneUpload, fineTuneNewDir } from '../../../api/ApiVC';
+    
+    // Set up the recorder
+    const recorder = new Recorder({
+      sampleBits: 16,                 // bits per sample; 8 or 16 are supported, default is 16
+      sampleRate: 16000,              // sample rate; supports 11025, 16000, 22050, 24000, 44100, 48000; defaults to the browser's rate (48000 in the author's Chrome)
+      numChannels: 1,                 // channel count; 1 or 2 are supported, default is 1
+      compiling: true
+    })
+    
+    // Set up the player: one AudioContext shared by every playAudioData() call
+    const audioCtx = new AudioContext({
+        latencyHint: 'interactive',
+        sampleRate: 16000,
+    });
+
+    function blobToDataURL(blob, callback) {
+        let a = new FileReader();
+        a.onload = function (e) { callback(e.target.result); }
+        a.readAsDataURL(blob);
+    }
+
+    
+    export default {
+        // Reactive state of the component
+        data(){
+            return {
+              vcDatas:[],                   // table rows built by GetList()
+              defaultDataPath: 'default',   // data path of the built-in example; its audio cannot be deleted
+              nowDataPath: '',              // data path currently shown; sent to the list/upload/fine-tune requests
+              expPath: '',                  // result of the last fine-tune request; sent as exp_path when synthesizing
+              wav: '',                      // WAV blob of the last recording
+              wav_base64: '',
+              ttsText: '',                  // text to synthesize
+              cloneWav: '',                 // result of the last synthesis request, used for play/download
+              
+              onEnrollRec: 0,  // recording state: 0 idle, 1 recording, anything else = recorded, waiting for upload
+              onFinetune: 0,  // fine-tune state: 0 idle, 1 running, 2 succeeded, anything else failed
+              onSyn: 0, // synthesis state: 0 idle, non-zero while running
+            }
+        },
+        // Start on the default data path and load its list
+        mounted () {
+            this.nowDataPath = this.defaultDataPath
+            this.GetList()
+            
+        },
+        methods: {
+            // Put the fine-tune button back into its idle state
+            resetFinetuneBtn(){
+                this.onFinetune = 0
+            },
+        
+        // 一键重置
+        async clearAll(){
+            this.vcDatas = []
+            const result = await fineTuneNewDir()
+            console.log("clearALL: ", result.data.result);
+            this.nowDataPath = result.data.result
+            this.expPath = ''
+            this.onFinetune = 0
+            await this.GetList()
+        },
+        // Show the built-in default example again
+        async resetDefault(){
+            this.nowDataPath = this.defaultDataPath
+            await this.GetList()
+            // The experiment path is cleared only after the list has been reloaded
+            this.expPath = ''
+        },
+
+        // 开始录音
+        startRecorderEnroll(){
+            this.onEnrollRec = 1
+            recorder.clear()
+            recorder.start()
+        },
+        // Stop recording and keep the result as a WAV blob for uploadRecord()
+        stopRecorderEnroll(){
+            this.onEnrollRec = 2
+            recorder.stop()
+            this.wav = recorder.getWAVBlob()
+        },
+
+        // 上传录音
+        async uploadRecord(wavId){
+            this.onEnrollRec = 0
+            if(this.wav === ""){
+                this.$message.error("未检测到录音，录音失败，请重新录制")
+                return
+            } else {
+                if(this.wav === ''){
+                    this.$message.error("请先完成录音");
+                    this.onEnrollRec = 0
+                    return
+                } else {
+                    let fileRes = ""
+                    let fileString = ""
+                    fileRes = await this.readFile(this.wav);
+                    fileString = fileRes.result;
+                    const audioBase64type = (fileString.match(/data:[^;]*;base64,/))?.[0] ?? '';
+                    const isBase64 = !!fileString.match(/data:[^;]*;base64,/);
+                    const uploadBase64 = fileString.substr(audioBase64type.length);
+                    
+                    // 上传时指定文件路径
+                    const data = {
+                        'wav': uploadBase64,
+                        'filename': this.vcDatas[wavId]['wavName'],
+                        'wav_path': this.nowDataPath
+                    }
+
+                    const result = await fineTuneUpload(data);
+                    console.log(result)
+                    this.GetList() 
+                }
+                this.$message.success("录音上传成功")
+            }
+        }, 
+        // 读取文件和Blob
+        readFile(file) {
+            return new Promise((resolve, reject) => {
+                const fileReader = new FileReader();
+                fileReader.onload = function () {
+                    resolve(fileReader);
+                };
+                fileReader.onerror = function (err) {
+                    reject(err);
+                };
+                fileReader.readAsDataURL(file);
+                });
+            },
+
+            // 获取文件列表
+          async GetList(){
+            this.vcDatas = []
+            const result = await fineTuneList({
+              dataPath: this.nowDataPath
+            });
+            console.log(result, result.data.result);
+            for(let i=0; i<result.data.result.length; i++){
+                this.vcDatas.push({
+                  wavId: i,
+                  text: result.data.result[i]['text'],
+                  wavName: result.data.result[i]['name'],
+                  wavPath: result.data.result[i]['path'],
+                })
+            }
+            this.$nextTick(()=>{})
+          },
+                  // 播放音频
+    playAudioData( wav_buffer ) {
+        audioCtx.decodeAudioData(wav_buffer, buffer => {
+            var source = audioCtx.createBufferSource();
+            source.buffer = buffer;
+            source.connect(audioCtx.destination);
+            source.start();
+        }, function(e) {
+            Recorder.throwError(e);
+            })
+    },
+        // base64解码
+        base64ToUint8Array(base64String) {
+        const padding = '='.repeat((4 - base64String.length % 4) % 4);
+        const base64 = (base64String + padding)
+                        .replace(/-/g, '+')
+                        .replace(/_/g, '/');
+
+        const rawData = window.atob(base64);
+        const outputArray = new Uint8Array(rawData.length);
+
+        for (let i = 0; i < rawData.length; ++i) {
+                outputArray[i] = rawData.charCodeAt(i);
+        }
+        return outputArray;
+    },
+            // Play the audio of table row `wavId` (the row object is handed to Play unchanged)
+        async PlayTable(wavId){
+            this.Play(this.vcDatas[wavId])
+        },
+        // 播放合成后的音频
+        async PlaySyn(){
+           
+            if(this.cloneWav  === ""){
+                this.$message.error("请合成音频后再播放！！")
+                return
+            } else {
+                this.Play(this.cloneWav)
+            }
+        },
+        // 播放音频
+        async Play(wavBase){
+                // 获取音频数据
+                const result = await vcDownloadBase64(wavBase);
+                // console.log('play result', result)
+                if (result.data.code === 0) {
+                    // base转换二进制数
+                    let typedArray = this.base64ToUint8Array(result.data.result)
+                    // 添加wav文件头
+                    let view = new DataView(typedArray.buffer);
+                    view = Recorder.encodeWAV(view, 16000, 16000, 1, 16, true);
+                    // 播放音频
+                    this.playAudioData(view.buffer);
+                } else {
+                    this.$message.error("获取音频文件失败")
+                }
+        },
+                // 下载合成文件
+        async downLoadCloneWav(){
+            if(this.cloneWav  === ""){
+                this.$message.error("音频合成完毕后再下载！")
+            } else {
+                // const result = await vcDownload(this.cloneWav);
+                // 获取音频数据
+                const result = await vcDownloadBase64(this.cloneWav);
+                let view;
+                // console.log('play result', result)
+                if (result.data.code === 0) {
+                    // base转换二进制数
+                    let typedArray = this.base64ToUint8Array(result.data.result)
+                    // 添加wav文件头
+                    view = new DataView(typedArray.buffer);
+                    view = Recorder.encodeWAV(view, 16000, 16000, 1, 16, true);
+                    // 播放音频
+                    // this.playAudioData(view.buffer);
+                }
+                console.log(view.buffer)
+                // debugger
+                const blob = new Blob([view.buffer], { type: 'audio/wav' });
+                const fileName = new Date().getTime() + '.wav';
+                const down = document.createElement('a');
+                down.download = fileName;
+                down.style.display = 'none';//隐藏,没必要展示出来
+                down.href = URL.createObjectURL(blob);
+                document.body.appendChild(down);
+                down.click();
+                URL.revokeObjectURL(down.href); // 释放URL 对象
+                document.body.removeChild(down);//下载完成移除
+            }
+        },
+        // 删除音频文件
+        async delWav(wavId){
+            if(this.nowDataPath === this.defaultDataPath){
+                this.$message.error("默认音频不允许删除，可以一键重置，重新录音")
+                return 
+            }
+
+            console.log('wavId', wavId)
+            // 删除文件
+            const result = await vcDel(
+                {
+                    wavName: this.vcDatas[wavId]['wavName'],
+                    wavPath: this.vcDatas[wavId]['wavPath']
+                }
+            );
+            if(!result.data.code){
+                this.$message.success("删除成功")
+                this.GetList()
+            } else {
+                this.$message.error("文件删除失败")
+            }
+        }, 
+        // 微调模型
+        async fineTuneModel(){
+            // 先检查是否都有录音
+            for(let i=0; i < this.vcDatas.length; i++){
+                if(this.vcDatas['wavPath'] === ''){
+                    return this.$message.error("还有录音未完成，请先完成录音！")
+                }
+            }
+            this.onFinetune = 1
+            const result = await vcCloneFineTune(
+                {
+                    wav_path: this.nowDataPath,
+                }
+            );
+            if(!result.data.code){
+                this.onFinetune = 2
+                this.expPath = result.data.result
+                console.log("this.expPath: ", this.expPath)
+                this.$message.success("小数据微调成功")
+            } else {
+                this.onFinetune = 3
+                this.$message.error(result.data.msg)
+            }
+        },
+        // 合成音频
+        async fineTuneSyn(){
+            if(!this.expPath){
+                return this.$message.error("请先微调生成模型后再生成！")
+            }
+            // 合成
+            this.onSyn = 1
+            const result = await vcCloneFineTuneSyn(
+                {
+                    exp_path: this.expPath,
+                    text: this.ttsText
+                }
+            );
+            this.onSyn = 0
+            if(!result.data.code){
+                this.cloneWav = result.data.result
+                console.log("clone wav: ", this.cloneWav)
+                this.$message.success("音色克隆成功")
+            } else {
+                this.$message.error(result.data.msg)
+            }
+            this.$nextTick(()=>{})
+        }
+},
+};
+</script>
+    
+<style lang="less" scoped>
+// @import "./style.less";
+.finetune {
+  width: 1200px;
+  height: 410px;
+  background: #FFFFFF;
+  padding: 5px 80px 56px 80px;
+  box-sizing: border-box;
+}
+.el-row {
+  margin-bottom: 20px;
+}
+.grid-content {
+  border-radius: 4px;
+  min-height: 36px;
+}
+.play_board{
+    height: 100%;
+    display: flex;
+    align-items: center;
+}
+</style>
\ No newline at end of file
diff --git a/demos/speech_web/web_client/src/components/SubMenu/VoiceClone/VoiceClone.vue b/demos/speech_web/web_client/src/components/SubMenu/VoiceClone/VoiceClone.vue
new file mode 100644
index 00000000..1e380d28
--- /dev/null
+++ b/demos/speech_web/web_client/src/components/SubMenu/VoiceClone/VoiceClone.vue
@@ -0,0 +1,379 @@
+<template>
+    <div class="voiceclone">
+        <el-row :gutter="20">
+            <el-col :span="12"><div class="grid-content ep-bg-purple" />
+                <el-row :gutter="60" class="btn_row_wav" justify="center">
+                    <el-button class="ml-3" v-if="onEnrollRec === 0" @click="startRecorderEnroll()" type="primary">录制音频</el-button>
+                    <el-button class="ml-3" v-else-if="onEnrollRec === 1" @click="stopRecorderEnroll()" type="danger">停止录音</el-button>
+                    <el-button class="ml-3" v-else @click="uploadRecord()" type="success">上传录音</el-button>
+                    <a>&#12288</a>
+                    <el-upload
+                        :multiple="false"
+                        :accept="'.wav'"
+                        :auto-upload="false"
+                        :on-change="handleChange"
+                        :show-file-list="false"
+                    >
+                        <el-button class="ml-3" type="success">上传音频文件</el-button>
+                    </el-upload>
+                </el-row>
+                <div class="recording_table">
+                <el-table :data="vcDatas" border class="recording_table_box" scrollbar-always-on max-height="250px">
+                    <el-table-column prop="wavId" label="序号" width="60"/>
+                    <el-table-column prop="wavName" label="文件名" />
+                    <el-table-column label="操作" width="80">
+                        <template #default="scope">
+                            <div class="flex justify-space-between mb-4 flex-wrap gap-4">
+                                <a @click="PlayTable(scope.row.wavId)"><el-icon><VideoPlay /></el-icon></a>
+                                <a>&#12288</a>
+                                <a @click="delWav(scope.row.wavId)"><el-icon><DeleteFilled /></el-icon></a>
+                            </div>
+                        </template>
+                    </el-table-column>
+                    <el-table-column fixed="right" label="选择" width="70">
+                        <template #default="scope">
+                            <el-switch v-model="scope.row.status"  @click="choseWav(scope.row.wavId)"/>
+                        </template>
+                    </el-table-column>
+                </el-table>
+                </div>
+
+            </el-col>
+            <el-col :span="8"><div class="grid-content ep-bg-purple" />
+                <el-space direction="vertical">
+                    <el-card class="box-card" style="width: 250px; height:310px">
+                        <template #header>
+                            <div class="card-header">
+                            <span>请输入中文文本</span>
+                            </div>
+                        </template>
+                        <div class="mb-2 flex items-center text-sm">
+                            <el-radio-group v-model="func_radio" class="ml-4">
+                            <el-radio label="1" size="large">GE2E</el-radio>
+                            <el-radio label="2" size="large">ECAPA-TDNN</el-radio>
+                            </el-radio-group>
+                        </div>
+                        <el-input
+                            v-model="ttsText"
+                            :autosize="{ minRows: 8, maxRows: 13 }"
+                            type="textarea"
+                            placeholder="Please input"
+                            />
+                    </el-card>                    
+                </el-space>
+            </el-col>
+            <el-col :span="4"><div class="grid-content ep-bg-purple" />
+                <div class="play_board">
+                    <el-space direction="vertical">
+                        <el-row :gutter="20">
+                            <el-button size="large" v-if="g2pOnSys === 0" type="primary" @click="g2pClone()">开始合成</el-button>
+                            <el-button size="large" v-else :loading-icon="Eleme" type="danger">合成中</el-button>
+                        </el-row>
+
+                        <el-row :gutter="20">
+                            <el-button v-if='this.cloneWav' type="primary" @click="PlaySyn()">播放</el-button>
+                            <el-button v-else disabled type="primary" @click="PlaySyn()">播放</el-button>
+                            <el-button v-if='this.cloneWav' type="primary" @click="downLoadCloneWav()">下载</el-button>
+                            <el-button v-else disabled type="primary" @click="downLoadCloneWav()">下载</el-button>
+                        </el-row>
+                    </el-space>
+                </div>
+            </el-col>
+        </el-row>
+    </div>
+</template>
+
+<script>
+
+import Recorder from 'js-audio-recorder'
+import { vcCloneG2P, vcCloneSAT, vcDel, vcUpload, vcList, vcDownload, vcDownloadBase64 } from '../../../api/ApiVC';
+
+// Initialize the recorder (one instance shared by the whole module)
+const recorder = new Recorder({
+  sampleBits: 16,                 // bit depth: 8 or 16 supported, default 16
+  sampleRate: 16000,              // sample rate: 11025, 16000, 22050, 24000, 44100 or 48000 supported; defaults to the browser's rate (48000 in the author's Chrome)
+  numChannels: 1,                 // channel count: 1 or 2 supported, default 1
+  compiling: true                 // NOTE(review): presumably "convert while recording" — confirm against js-audio-recorder docs
+})
+
+// Initialize the player: one AudioContext shared by every playback in this module
+const audioCtx = new AudioContext({
+    latencyHint: 'interactive',
+    sampleRate: 16000,              // same rate as the recorder above
+});
+
+export default {
+    data(){
+         return {
+            onEnrollRec: 0,     // recording state: 0 idle, 1 recording, 2 recorded and waiting for upload
+            wav: '',            // recorded WAV blob, '' while nothing has been recorded
+            vcDatas: [],       // table rows built by GetList: { wavName, wavId, wavPath, status }
+            nowFile: "",        // wavName of the currently selected audio
+            ttsText: "欢迎使用飞桨语音套件",    // text to synthesize (bound to the textarea)
+            nowIndex: -1,       // index into vcDatas of the selected audio, -1 when none is selected
+            cloneWav: "",       // result returned by the clone request, '' until a synthesis succeeds
+            g2pOnSys: 0,        // 1 while a clone request is in flight, otherwise 0
+            func_radio: '1',    // radio value: '1' -> GE2E, '2' -> ECAPA-TDNN
+         }
+    },
+    mounted () {
+        this.GetList()      // fill the audio table from the server as soon as the component is mounted
+    },
+    methods:{
+        // Reset recording/selection state and the default text (cloneWav, g2pOnSys and func_radio are left as they are).
+        reset(){
+            this.onEnrollRec = 0                    // back to the idle record button
+            this.wav = ''                           // drop the last recording
+            this.vcDatas = []                       // empty the table
+            this.nowFile = ""
+            this.ttsText = "欢迎使用飞桨语音套件"
+            this.nowIndex = -1                      // -1 = no audio selected
+        },
+        // Start a new recording.
+        startRecorderEnroll(){
+            this.onEnrollRec = 1    // 1 = recording; the template now shows the stop button
+            recorder.clear()        // presumably discards audio captured earlier — confirm against js-audio-recorder
+            recorder.start()
+        },
+        // Stop recording and keep the captured audio as a WAV blob in `wav`.
+        stopRecorderEnroll(){
+            this.onEnrollRec = 2    // 2 = recorded; the template now shows the upload button
+            recorder.stop()
+            this.wav = recorder.getWAVBlob()    // read by uploadRecord
+        },
+        // chose wav
+        choseWav(wavId){
+            this.cloneWav = ''
+            this.nowFile = this.vcDatas[wavId].wavName
+            this.nowIndex = wavId
+            // only wavId is true else false
+            for(let i=0; i<this.vcDatas.length; i++){
+                if(i==wavId){
+                    this.vcDatas[wavId].status = true
+                } else {
+                    this.vcDatas[i].status = false
+                }
+            }
+            this.$nextTick(()=>{})
+        },
+        // 上传录音
+        async uploadRecord(){
+            this.onEnrollRec = 0
+            if(this.wav === ""){
+                this.$message.error("未检测到录音，录音失败，请重新录制")
+                return
+            } else {
+                if(this.wav === ''){
+                    this.$message.error("请先完成录音");
+                    this.onEnrollRec = 0
+                    return
+                } else {
+                    let formData = new FormData();
+                    formData.append('files', this.wav);
+                    const result = await vcUpload(formData);
+                    console.log(result)
+                    this.GetList() 
+                }
+                this.$message.success("录音上传成功")
+            }
+        }, 
+        // 上传列表改变
+        async handleChange(file, fileList){
+            for(let i=0; i<fileList.length; i++){
+                this.uploadFile(fileList[i])
+            } 
+        },
+
+        // 上传音频
+        async uploadFile(file){
+            let formData = new FormData();
+            formData.append('files', file.raw);
+            const result = await vcUpload(formData);
+            if (result.data.code === 0) {
+                this.$message.success("音频上传成功")
+                this.GetList()
+            } else {
+                this.$message.error("音频上传失败")
+            }
+        },
+        // 获取文件列表
+        async GetList(){
+            this.vcDatas =[]
+            const result = await vcList();
+            for(let i=0; i<result.data.result.length; i++){
+                this.vcDatas.push({
+                    wavName: result.data.result[i]['name'],
+                    wavId: i,
+                    wavPath: result.data.result[i]['path'],
+                    status: false
+                })
+            }
+            this.$nextTick(()=>{})
+        },
+        // 删除音频文件
+        async delWav(wavId){
+            console.log('wavId', wavId)
+            // 删除文件
+            const result = await vcDel(
+                {
+                    wavName: this.vcDatas[wavId]['wavName'],
+                    wavPath: this.vcDatas[wavId]['wavPath']
+                }
+            );
+            if(!result.data.code){
+                this.$message.success("删除成功")
+            } else {
+                this.$message.error(result.data.msg)
+            }
+            this.GetList()
+            this.reset()
+        },
+        // 下载合成文件
+        async downLoadCloneWav(){
+            if(this.cloneWav  === ""){
+                this.$message.error("音频合成完毕后再下载！")
+            } else {
+                // const result = await vcDownload(this.cloneWav);
+                // 获取音频数据
+                const result = await vcDownloadBase64(this.cloneWav);
+                let view;
+                // console.log('play result', result)
+                if (result.data.code === 0) {
+                    // base转换二进制数
+                    let typedArray = this.base64ToUint8Array(result.data.result)
+                    // 添加wav文件头
+                    view = new DataView(typedArray.buffer);
+                    view = Recorder.encodeWAV(view, 16000, 16000, 1, 16, true);
+                    // 播放音频
+                    // this.playAudioData(view.buffer);
+                }
+                console.log(view.buffer)
+                // debugger
+                const blob = new Blob([view.buffer], { type: 'audio/wav' });
+                const fileName = new Date().getTime() + '.wav';
+                const down = document.createElement('a');
+                down.download = fileName;
+                down.style.display = 'none';//隐藏,没必要展示出来
+                down.href = URL.createObjectURL(blob);
+                document.body.appendChild(down);
+                down.click();
+                URL.revokeObjectURL(down.href); // 释放URL 对象
+                document.body.removeChild(down);//下载完成移除
+            }
+        },
+        // g2p voice clone
+        async g2pClone(){
+            if(this.nowIndex === -1){
+                return this.$message.error("请先录音并上传，选择音频后再点击合成")
+            } else if (this.ttsText === ""){
+                return this.$message.error("合成文本不可以为空")
+            } else if (this.nowIndex >= this.vcDatas.length){
+                return this.$message.error("当前序号不可以超过音频个数")
+            }
+            let func = ''
+            if(this.func_radio === '1'){
+                func = 'ge2e'
+            } else {
+                func = 'ecapa_tdnn'
+            }
+            console.log('func', func)
+
+            // 合成
+            this.g2pOnSys = 1
+            const result = await vcCloneG2P(
+                {
+                    wavName: this.vcDatas[this.nowIndex]['wavName'],
+                    wavPath: this.vcDatas[this.nowIndex]['wavPath'],
+                    text: this.ttsText,
+                    func: func
+                }
+            );
+            this.g2pOnSys = 0
+            if(!result.data.code){
+                this.cloneWav = result.data.result
+                console.log("clone wav: ", this.cloneWav)
+                this.$message.success("音色克隆成功")
+            } else {
+                this.$message.error(result.data.msg)
+            }
+        },
+        // Play the uploaded audio shown in table row `wavId`.
+        async PlayTable(wavId){
+            this.Play(this.vcDatas[wavId])    // hands the whole row object to Play
+        },
+        // 播放合成后的音频
+        async PlaySyn(){
+            if(this.cloneWav  === ""){
+                this.$message.error("请合成音频后再播放！！")
+                return
+            } else {
+                this.Play(this.cloneWav)
+            }
+        },
+        // 播放音频
+        async Play(wavBase){
+                // 获取音频数据
+                const result = await vcDownloadBase64(wavBase);
+                // console.log('play result', result)
+                if (result.data.code === 0) {
+                    // base转换二进制数
+                    let typedArray = this.base64ToUint8Array(result.data.result)
+                    // 添加wav文件头
+                    let view = new DataView(typedArray.buffer);
+                    view = Recorder.encodeWAV(view, 16000, 16000, 1, 16, true);
+                    // 播放音频
+                    this.playAudioData(view.buffer);
+                };
+        },
+        // base64解码
+        base64ToUint8Array(base64String) {
+            const padding = '='.repeat((4 - base64String.length % 4) % 4);
+            const base64 = (base64String + padding)
+                            .replace(/-/g, '+')
+                            .replace(/_/g, '/');
+
+            const rawData = window.atob(base64);
+            const outputArray = new Uint8Array(rawData.length);
+
+            for (let i = 0; i < rawData.length; ++i) {
+                    outputArray[i] = rawData.charCodeAt(i);
+            }
+            return outputArray;
+        }, 
+        // 播放音频
+        playAudioData( wav_buffer ) {
+        audioCtx.decodeAudioData(wav_buffer, buffer => {
+            var source = audioCtx.createBufferSource();
+            source.buffer = buffer;
+            source.connect(audioCtx.destination);
+            source.start();
+        }, function(e) {
+            Recorder.throwError(e);
+            })
+        },
+    },
+}
+</script>
+
+<style lang="less" scoped>
+// @import "./style.less";
+.voiceclone {
+    width: 1200px;
+    height: 410px;
+    background: #FFFFFF;
+    padding: 5px 80px 56px 80px;
+    box-sizing: border-box;
+}
+.el-row {
+  margin-bottom: 20px;
+}
+.grid-content {
+  border-radius: 4px;
+  min-height: 36px;
+}
+.play_board{
+    height: 100%;
+    display: flex;
+    align-items: center;
+}
+</style>
\ No newline at end of file
diff --git a/demos/speech_web/web_client/src/main.js b/demos/speech_web/web_client/src/main.js
index 3fbf87c8..544f5b30 100644
--- a/demos/speech_web/web_client/src/main.js
+++ b/demos/speech_web/web_client/src/main.js
@@ -1,5 +1,6 @@
 import { createApp } from 'vue'
 import ElementPlus from 'element-plus'
+import * as ElementPlusIconsVue from '@element-plus/icons-vue'
 import 'element-plus/dist/index.css'
 import Antd from 'ant-design-vue';
 import 'ant-design-vue/dist/antd.css';
@@ -9,5 +10,8 @@ import axios from 'axios'
 const app = createApp(App)
 app.config.globalProperties.$http = axios
 
+for (const [key, component] of Object.entries(ElementPlusIconsVue)) { // register every exported Element Plus icon globally under its export name
+    app.component(key, component)
+  }
 app.use(ElementPlus).use(Antd)
 app.mount('#app')
diff --git a/demos/speech_web/web_client/yarn.lock b/demos/speech_web/web_client/yarn.lock
index 6777cf4c..7f07daa0 100644
--- a/demos/speech_web/web_client/yarn.lock
+++ b/demos/speech_web/web_client/yarn.lock
@@ -44,6 +44,11 @@
   resolved "https://registry.npmmirror.com/@element-plus/icons-vue/-/icons-vue-1.1.4.tgz"
   integrity sha512-Iz/nHqdp1sFPmdzRwHkEQQA3lKvoObk8azgABZ81QUOpW9s/lUyQVUSh0tNtEPZXQlKwlSh7SPgoVxzrE0uuVQ==
 
+"@element-plus/icons-vue@^2.0.9":
+  version "2.0.9"
+  resolved "https://registry.npmmirror.com/@element-plus/icons-vue/-/icons-vue-2.0.9.tgz#b7777c57534522e387303d194451d50ff549d49a"
+  integrity sha512-okdrwiVeKBmW41Hkl0eMrXDjzJwhQMuKiBOu17rOszqM+LS/yBYpNQNV5Jvoh06Wc+89fMmb/uhzf8NZuDuUaQ==
+
 "@floating-ui/core@^0.6.1":
   version "0.6.1"
   resolved "https://registry.npmmirror.com/@floating-ui/core/-/core-0.6.1.tgz"
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 3fb82367..fd7a481b 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -20,6 +20,7 @@ onnxruntime==1.10.0
 opencc
 paddlenlp
 paddlepaddle>=2.2.2
+paddlespeech_ctcdecoders
 paddlespeech_feat
 pandas
 pathos == 0.2.8
@@ -27,8 +28,8 @@ pattern_singleton
 Pillow>=9.0.0
 praatio==5.0.0
 prettytable
-pypinyin<=0.44.0
 pypinyin-dict
+pypinyin<=0.44.0
 python-dateutil
 pyworld==0.2.12
 recommonmark>=0.5.0
diff --git a/docs/source/api/paddlespeech.cls.exps.panns.deploy.predict.rst b/docs/source/api/paddlespeech.cls.exps.panns.deploy.predict.rst
deleted file mode 100644
index d4f92a2e..00000000
--- a/docs/source/api/paddlespeech.cls.exps.panns.deploy.predict.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.cls.exps.panns.deploy.predict module
-=================================================
-
-.. automodule:: paddlespeech.cls.exps.panns.deploy.predict
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.cls.exps.panns.deploy.rst b/docs/source/api/paddlespeech.cls.exps.panns.deploy.rst
index 4415c933..369862cc 100644
--- a/docs/source/api/paddlespeech.cls.exps.panns.deploy.rst
+++ b/docs/source/api/paddlespeech.cls.exps.panns.deploy.rst
@@ -12,4 +12,3 @@ Submodules
 .. toctree::
    :maxdepth: 4
 
-   paddlespeech.cls.exps.panns.deploy.predict
diff --git a/docs/source/api/paddlespeech.cls.exps.panns.export_model.rst b/docs/source/api/paddlespeech.cls.exps.panns.export_model.rst
deleted file mode 100644
index 6c39c2bc..00000000
--- a/docs/source/api/paddlespeech.cls.exps.panns.export_model.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.cls.exps.panns.export\_model module
-================================================
-
-.. automodule:: paddlespeech.cls.exps.panns.export_model
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.cls.exps.panns.predict.rst b/docs/source/api/paddlespeech.cls.exps.panns.predict.rst
deleted file mode 100644
index 88cd4033..00000000
--- a/docs/source/api/paddlespeech.cls.exps.panns.predict.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.cls.exps.panns.predict module
-==========================================
-
-.. automodule:: paddlespeech.cls.exps.panns.predict
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.cls.exps.panns.rst b/docs/source/api/paddlespeech.cls.exps.panns.rst
index 6147b245..72f30ba6 100644
--- a/docs/source/api/paddlespeech.cls.exps.panns.rst
+++ b/docs/source/api/paddlespeech.cls.exps.panns.rst
@@ -20,6 +20,3 @@ Submodules
 .. toctree::
    :maxdepth: 4
 
-   paddlespeech.cls.exps.panns.export_model
-   paddlespeech.cls.exps.panns.predict
-   paddlespeech.cls.exps.panns.train
diff --git a/docs/source/api/paddlespeech.cls.exps.panns.train.rst b/docs/source/api/paddlespeech.cls.exps.panns.train.rst
deleted file mode 100644
index a89b7eec..00000000
--- a/docs/source/api/paddlespeech.cls.exps.panns.train.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.cls.exps.panns.train module
-========================================
-
-.. automodule:: paddlespeech.cls.exps.panns.train
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst
deleted file mode 100644
index 46a149b0..00000000
--- a/docs/source/api/paddlespeech.kws.exps.mdtc.plot_det_curve.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.kws.exps.mdtc.plot\_det\_curve module
-==================================================
-
-.. automodule:: paddlespeech.kws.exps.mdtc.plot_det_curve
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.kws.exps.mdtc.rst b/docs/source/api/paddlespeech.kws.exps.mdtc.rst
index f6cad64e..33d4a55c 100644
--- a/docs/source/api/paddlespeech.kws.exps.mdtc.rst
+++ b/docs/source/api/paddlespeech.kws.exps.mdtc.rst
@@ -14,6 +14,5 @@ Submodules
 
    paddlespeech.kws.exps.mdtc.collate
    paddlespeech.kws.exps.mdtc.compute_det
-   paddlespeech.kws.exps.mdtc.plot_det_curve
    paddlespeech.kws.exps.mdtc.score
    paddlespeech.kws.exps.mdtc.train
diff --git a/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.rst b/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.rst
index 8093619b..dfcd274c 100644
--- a/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.rst
+++ b/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.rst
@@ -13,5 +13,4 @@ Submodules
    :maxdepth: 4
 
    paddlespeech.s2t.decoders.ctcdecoder.decoders_deprecated
-   paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated
    paddlespeech.s2t.decoders.ctcdecoder.swig_wrapper
diff --git a/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated.rst b/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated.rst
deleted file mode 100644
index 1079d672..00000000
--- a/docs/source/api/paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.decoders.ctcdecoder.scorer\_deprecated module
-==============================================================
-
-.. automodule:: paddlespeech.s2t.decoders.ctcdecoder.scorer_deprecated
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.decoders.recog_bin.rst b/docs/source/api/paddlespeech.s2t.decoders.recog_bin.rst
deleted file mode 100644
index 4952e2e6..00000000
--- a/docs/source/api/paddlespeech.s2t.decoders.recog_bin.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.decoders.recog\_bin module
-===========================================
-
-.. automodule:: paddlespeech.s2t.decoders.recog_bin
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.decoders.rst b/docs/source/api/paddlespeech.s2t.decoders.rst
index e4eabedf..53e0d9c4 100644
--- a/docs/source/api/paddlespeech.s2t.decoders.rst
+++ b/docs/source/api/paddlespeech.s2t.decoders.rst
@@ -23,5 +23,4 @@ Submodules
    :maxdepth: 4
 
    paddlespeech.s2t.decoders.recog
-   paddlespeech.s2t.decoders.recog_bin
    paddlespeech.s2t.decoders.utils
diff --git a/docs/source/api/paddlespeech.s2t.decoders.scorers.ngram.rst b/docs/source/api/paddlespeech.s2t.decoders.scorers.ngram.rst
deleted file mode 100644
index f38a6109..00000000
--- a/docs/source/api/paddlespeech.s2t.decoders.scorers.ngram.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.decoders.scorers.ngram module
-==============================================
-
-.. automodule:: paddlespeech.s2t.decoders.scorers.ngram
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.decoders.scorers.rst b/docs/source/api/paddlespeech.s2t.decoders.scorers.rst
index 83808c49..ca834f6b 100644
--- a/docs/source/api/paddlespeech.s2t.decoders.scorers.rst
+++ b/docs/source/api/paddlespeech.s2t.decoders.scorers.rst
@@ -15,5 +15,4 @@ Submodules
    paddlespeech.s2t.decoders.scorers.ctc
    paddlespeech.s2t.decoders.scorers.ctc_prefix_score
    paddlespeech.s2t.decoders.scorers.length_bonus
-   paddlespeech.s2t.decoders.scorers.ngram
    paddlespeech.s2t.decoders.scorers.scorer_interface
diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.client.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.client.rst
deleted file mode 100644
index a73a5685..00000000
--- a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.client.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.exps.deepspeech2.bin.deploy.client module
-==========================================================
-
-.. automodule:: paddlespeech.s2t.exps.deepspeech2.bin.deploy.client
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.record.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.record.rst
deleted file mode 100644
index bc107848..00000000
--- a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.record.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.exps.deepspeech2.bin.deploy.record module
-==========================================================
-
-.. automodule:: paddlespeech.s2t.exps.deepspeech2.bin.deploy.record
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.rst
index d1f966fc..28de0f7f 100644
--- a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.rst
+++ b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.rst
@@ -12,8 +12,5 @@ Submodules
 .. toctree::
    :maxdepth: 4
 
-   paddlespeech.s2t.exps.deepspeech2.bin.deploy.client
-   paddlespeech.s2t.exps.deepspeech2.bin.deploy.record
    paddlespeech.s2t.exps.deepspeech2.bin.deploy.runtime
-   paddlespeech.s2t.exps.deepspeech2.bin.deploy.send
    paddlespeech.s2t.exps.deepspeech2.bin.deploy.server
diff --git a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.send.rst b/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.send.rst
deleted file mode 100644
index ba1ae0a6..00000000
--- a/docs/source/api/paddlespeech.s2t.exps.deepspeech2.bin.deploy.send.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.exps.deepspeech2.bin.deploy.send module
-========================================================
-
-.. automodule:: paddlespeech.s2t.exps.deepspeech2.bin.deploy.send
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.exps.u2.rst b/docs/source/api/paddlespeech.s2t.exps.u2.rst
index e0ebb7fc..bf565670 100644
--- a/docs/source/api/paddlespeech.s2t.exps.u2.rst
+++ b/docs/source/api/paddlespeech.s2t.exps.u2.rst
@@ -21,4 +21,3 @@ Submodules
    :maxdepth: 4
 
    paddlespeech.s2t.exps.u2.model
-   paddlespeech.s2t.exps.u2.trainer
diff --git a/docs/source/api/paddlespeech.s2t.exps.u2.trainer.rst b/docs/source/api/paddlespeech.s2t.exps.u2.trainer.rst
deleted file mode 100644
index 0cd28945..00000000
--- a/docs/source/api/paddlespeech.s2t.exps.u2.trainer.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.exps.u2.trainer module
-=======================================
-
-.. automodule:: paddlespeech.s2t.exps.u2.trainer
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.recog.rst b/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.recog.rst
deleted file mode 100644
index bc749c8f..00000000
--- a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.recog.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.exps.u2\_kaldi.bin.recog module
-================================================
-
-.. automodule:: paddlespeech.s2t.exps.u2_kaldi.bin.recog
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.rst b/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.rst
index ff1a6efe..087b8767 100644
--- a/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.rst
+++ b/docs/source/api/paddlespeech.s2t.exps.u2_kaldi.bin.rst
@@ -12,6 +12,5 @@ Submodules
 .. toctree::
    :maxdepth: 4
 
-   paddlespeech.s2t.exps.u2_kaldi.bin.recog
    paddlespeech.s2t.exps.u2_kaldi.bin.test
    paddlespeech.s2t.exps.u2_kaldi.bin.train
diff --git a/docs/source/api/paddlespeech.s2t.training.extensions.rst b/docs/source/api/paddlespeech.s2t.training.extensions.rst
index f31b8427..13530a8d 100644
--- a/docs/source/api/paddlespeech.s2t.training.extensions.rst
+++ b/docs/source/api/paddlespeech.s2t.training.extensions.rst
@@ -15,5 +15,3 @@ Submodules
    paddlespeech.s2t.training.extensions.evaluator
    paddlespeech.s2t.training.extensions.extension
    paddlespeech.s2t.training.extensions.plot
-   paddlespeech.s2t.training.extensions.snapshot
-   paddlespeech.s2t.training.extensions.visualizer
diff --git a/docs/source/api/paddlespeech.s2t.training.extensions.snapshot.rst b/docs/source/api/paddlespeech.s2t.training.extensions.snapshot.rst
deleted file mode 100644
index e0ca21a7..00000000
--- a/docs/source/api/paddlespeech.s2t.training.extensions.snapshot.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.training.extensions.snapshot module
-====================================================
-
-.. automodule:: paddlespeech.s2t.training.extensions.snapshot
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.training.extensions.visualizer.rst b/docs/source/api/paddlespeech.s2t.training.extensions.visualizer.rst
deleted file mode 100644
index 22ae11f1..00000000
--- a/docs/source/api/paddlespeech.s2t.training.extensions.visualizer.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.training.extensions.visualizer module
-======================================================
-
-.. automodule:: paddlespeech.s2t.training.extensions.visualizer
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.training.updaters.rst b/docs/source/api/paddlespeech.s2t.training.updaters.rst
index a0617016..b38704a0 100644
--- a/docs/source/api/paddlespeech.s2t.training.updaters.rst
+++ b/docs/source/api/paddlespeech.s2t.training.updaters.rst
@@ -13,5 +13,4 @@ Submodules
    :maxdepth: 4
 
    paddlespeech.s2t.training.updaters.standard_updater
-   paddlespeech.s2t.training.updaters.trainer
    paddlespeech.s2t.training.updaters.updater
diff --git a/docs/source/api/paddlespeech.s2t.training.updaters.trainer.rst b/docs/source/api/paddlespeech.s2t.training.updaters.trainer.rst
deleted file mode 100644
index 6981a8f0..00000000
--- a/docs/source/api/paddlespeech.s2t.training.updaters.trainer.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.training.updaters.trainer module
-=================================================
-
-.. automodule:: paddlespeech.s2t.training.updaters.trainer
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.add_deltas.rst b/docs/source/api/paddlespeech.s2t.transform.add_deltas.rst
deleted file mode 100644
index 5007fd9d..00000000
--- a/docs/source/api/paddlespeech.s2t.transform.add_deltas.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.add\_deltas module
-=============================================
-
-.. automodule:: paddlespeech.s2t.transform.add_deltas
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.channel_selector.rst b/docs/source/api/paddlespeech.s2t.transform.channel_selector.rst
deleted file mode 100644
index e08dd253..00000000
--- a/docs/source/api/paddlespeech.s2t.transform.channel_selector.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.channel\_selector module
-===================================================
-
-.. automodule:: paddlespeech.s2t.transform.channel_selector
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.cmvn.rst b/docs/source/api/paddlespeech.s2t.transform.cmvn.rst
deleted file mode 100644
index 8348e3d4..00000000
--- a/docs/source/api/paddlespeech.s2t.transform.cmvn.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.cmvn module
-======================================
-
-.. automodule:: paddlespeech.s2t.transform.cmvn
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.functional.rst b/docs/source/api/paddlespeech.s2t.transform.functional.rst
deleted file mode 100644
index eb2b54a6..00000000
--- a/docs/source/api/paddlespeech.s2t.transform.functional.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.functional module
-============================================
-
-.. automodule:: paddlespeech.s2t.transform.functional
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.perturb.rst b/docs/source/api/paddlespeech.s2t.transform.perturb.rst
deleted file mode 100644
index 0be28ab7..00000000
--- a/docs/source/api/paddlespeech.s2t.transform.perturb.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.perturb module
-=========================================
-
-.. automodule:: paddlespeech.s2t.transform.perturb
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.rst b/docs/source/api/paddlespeech.s2t.transform.rst
deleted file mode 100644
index 5016ff4f..00000000
--- a/docs/source/api/paddlespeech.s2t.transform.rst
+++ /dev/null
@@ -1,24 +0,0 @@
-paddlespeech.s2t.transform package
-==================================
-
-.. automodule:: paddlespeech.s2t.transform
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Submodules
-----------
-
-.. toctree::
-   :maxdepth: 4
-
-   paddlespeech.s2t.transform.add_deltas
-   paddlespeech.s2t.transform.channel_selector
-   paddlespeech.s2t.transform.cmvn
-   paddlespeech.s2t.transform.functional
-   paddlespeech.s2t.transform.perturb
-   paddlespeech.s2t.transform.spec_augment
-   paddlespeech.s2t.transform.spectrogram
-   paddlespeech.s2t.transform.transform_interface
-   paddlespeech.s2t.transform.transformation
-   paddlespeech.s2t.transform.wpe
diff --git a/docs/source/api/paddlespeech.s2t.transform.spec_augment.rst b/docs/source/api/paddlespeech.s2t.transform.spec_augment.rst
deleted file mode 100644
index 00fd3ea1..00000000
--- a/docs/source/api/paddlespeech.s2t.transform.spec_augment.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.spec\_augment module
-===============================================
-
-.. automodule:: paddlespeech.s2t.transform.spec_augment
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.spectrogram.rst b/docs/source/api/paddlespeech.s2t.transform.spectrogram.rst
deleted file mode 100644
index 33c499a7..00000000
--- a/docs/source/api/paddlespeech.s2t.transform.spectrogram.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.spectrogram module
-=============================================
-
-.. automodule:: paddlespeech.s2t.transform.spectrogram
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.transform_interface.rst b/docs/source/api/paddlespeech.s2t.transform.transform_interface.rst
deleted file mode 100644
index 009b0658..00000000
--- a/docs/source/api/paddlespeech.s2t.transform.transform_interface.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.transform\_interface module
-======================================================
-
-.. automodule:: paddlespeech.s2t.transform.transform_interface
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.transformation.rst b/docs/source/api/paddlespeech.s2t.transform.transformation.rst
deleted file mode 100644
index a03e731a..00000000
--- a/docs/source/api/paddlespeech.s2t.transform.transformation.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.transformation module
-================================================
-
-.. automodule:: paddlespeech.s2t.transform.transformation
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.s2t.transform.wpe.rst b/docs/source/api/paddlespeech.s2t.transform.wpe.rst
deleted file mode 100644
index c4831f7f..00000000
--- a/docs/source/api/paddlespeech.s2t.transform.wpe.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.s2t.transform.wpe module
-=====================================
-
-.. automodule:: paddlespeech.s2t.transform.wpe
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.server.engine.acs.python.acs_engine.rst b/docs/source/api/paddlespeech.server.engine.acs.python.acs_engine.rst
deleted file mode 100644
index 9b61633e..00000000
--- a/docs/source/api/paddlespeech.server.engine.acs.python.acs_engine.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.server.engine.acs.python.acs\_engine module
-========================================================
-
-.. automodule:: paddlespeech.server.engine.acs.python.acs_engine
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.server.engine.acs.python.rst b/docs/source/api/paddlespeech.server.engine.acs.python.rst
index 3c06ba08..7e5582bd 100644
--- a/docs/source/api/paddlespeech.server.engine.acs.python.rst
+++ b/docs/source/api/paddlespeech.server.engine.acs.python.rst
@@ -12,4 +12,3 @@ Submodules
 .. toctree::
    :maxdepth: 4
 
-   paddlespeech.server.engine.acs.python.acs_engine
diff --git a/docs/source/api/paddlespeech.server.utils.log.rst b/docs/source/api/paddlespeech.server.utils.log.rst
deleted file mode 100644
index 453b4a61..00000000
--- a/docs/source/api/paddlespeech.server.utils.log.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.server.utils.log module
-====================================
-
-.. automodule:: paddlespeech.server.utils.log
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.t2s.exps.rst b/docs/source/api/paddlespeech.t2s.exps.rst
index bee18a97..643f97b4 100644
--- a/docs/source/api/paddlespeech.t2s.exps.rst
+++ b/docs/source/api/paddlespeech.t2s.exps.rst
@@ -30,10 +30,10 @@ Submodules
 
    paddlespeech.t2s.exps.inference
    paddlespeech.t2s.exps.inference_streaming
+   paddlespeech.t2s.models.vits.monotonic_align
    paddlespeech.t2s.exps.ort_predict
    paddlespeech.t2s.exps.ort_predict_e2e
    paddlespeech.t2s.exps.ort_predict_streaming
-   paddlespeech.t2s.exps.stream_play_tts
    paddlespeech.t2s.exps.syn_utils
    paddlespeech.t2s.exps.synthesize
    paddlespeech.t2s.exps.synthesize_e2e
diff --git a/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst b/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst
deleted file mode 100644
index cb22dde0..00000000
--- a/docs/source/api/paddlespeech.t2s.exps.stream_play_tts.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.t2s.exps.stream\_play\_tts module
-==============================================
-
-.. automodule:: paddlespeech.t2s.exps.stream_play_tts
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.t2s.models.ernie_sat.mlm.rst b/docs/source/api/paddlespeech.t2s.models.ernie_sat.mlm.rst
deleted file mode 100644
index f0e8fd11..00000000
--- a/docs/source/api/paddlespeech.t2s.models.ernie_sat.mlm.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.t2s.models.ernie\_sat.mlm module
-=============================================
-
-.. automodule:: paddlespeech.t2s.models.ernie_sat.mlm
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst
deleted file mode 100644
index 7aaba795..00000000
--- a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.core.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.t2s.models.vits.monotonic\_align.core module
-=========================================================
-
-.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.core
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst
deleted file mode 100644
index 25c819a7..00000000
--- a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-paddlespeech.t2s.models.vits.monotonic\_align package
-=====================================================
-
-.. automodule:: paddlespeech.t2s.models.vits.monotonic_align
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
-Submodules
-----------
-
-.. toctree::
-   :maxdepth: 4
-
-   paddlespeech.t2s.models.vits.monotonic_align.core
-   paddlespeech.t2s.models.vits.monotonic_align.setup
diff --git a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst b/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst
deleted file mode 100644
index a93c3b8b..00000000
--- a/docs/source/api/paddlespeech.t2s.models.vits.monotonic_align.setup.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-paddlespeech.t2s.models.vits.monotonic\_align.setup module
-==========================================================
-
-.. automodule:: paddlespeech.t2s.models.vits.monotonic_align.setup
-   :members:
-   :undoc-members:
-   :show-inheritance:
diff --git a/docs/source/api/paddlespeech.t2s.models.vits.rst b/docs/source/api/paddlespeech.t2s.models.vits.rst
index 3146094b..205496f0 100644
--- a/docs/source/api/paddlespeech.t2s.models.vits.rst
+++ b/docs/source/api/paddlespeech.t2s.models.vits.rst
@@ -12,7 +12,6 @@ Subpackages
 .. toctree::
    :maxdepth: 4
 
-   paddlespeech.t2s.models.vits.monotonic_align
    paddlespeech.t2s.models.vits.wavenet
 
 Submodules
diff --git a/docs/source/tts/demo.rst b/docs/source/tts/demo.rst
index ca2fd98e..1ae687f8 100644
--- a/docs/source/tts/demo.rst
+++ b/docs/source/tts/demo.rst
@@ -42,7 +42,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
         <tr>
             <td >Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0001.wav"
                         type="audio/wav">
@@ -50,7 +50,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
                 
             
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_0.wav"
                         type="audio/wav">
@@ -61,7 +61,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
         <tr>
             <td>in being comparatively modern.</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0002.wav"
                         type="audio/wav">
@@ -70,7 +70,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
 
             </td>
             <td>
-             <audio controls="controls">
+             <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_1.wav"
                         type="audio/wav">
@@ -81,7 +81,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
         <tr>
             <td>For although the Chinese took impressions from wood blocks engraved in relief for centuries before the woodcutters of the Netherlands, by a similar process</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0003.wav"
                         type="audio/wav">
@@ -89,7 +89,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_2.wav"
                         type="audio/wav">
@@ -100,7 +100,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
         <tr>
             <td>produced the block books, which were the immediate predecessors of the true printed book</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0004.wav"
                         type="audio/wav">
@@ -108,7 +108,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_3.wav"
                         type="audio/wav">
@@ -119,7 +119,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
         <tr>
             <td>the invention of movable metal letters in the middle of the fifteenth century may justly be considered as the invention of the art of printing.</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/ljspeech_gt/LJ001-0005.wav"
                         type="audio/wav">
@@ -127,7 +127,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/waveflow_res128_ljspeech_samples_1.0/step_2000k_sentence_4.wav"
                         type="audio/wav">
@@ -153,7 +153,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
         <tr>
             <td>昨日，这名“伤者”与医生全部被警方依法刑事拘留</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009901.wav"
                         type="audio/wav">
@@ -161,7 +161,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009901.wav"
                         type="audio/wav">
@@ -172,7 +172,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
         <tr>
             <td>钱伟长想到上海来办学校是经过深思熟虑的。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009902.wav"
                         type="audio/wav">
@@ -180,7 +180,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009902.wav"
                         type="audio/wav">
@@ -191,7 +191,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
         <tr>
             <td>她见我一进门就骂，吃饭时也骂，骂得我抬不起头。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009903.wav"
                         type="audio/wav">
@@ -199,7 +199,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009903.wav"
                         type="audio/wav">
@@ -210,7 +210,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
         <tr>
             <td>李述德在离开之前，只说了一句“柱驼杀父亲了”</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009904.wav"
                         type="audio/wav">
@@ -218,7 +218,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009904.wav"
                         type="audio/wav">
@@ -230,7 +230,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
         <tr>
             <td>这种车票和保险单捆绑出售属于重复性购买。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/baker_gt_24k/009905.wav"
                         type="audio/wav">
@@ -238,7 +238,7 @@ Audio samples generated from ground-truth spectrograms with a vocoder.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/pwg_baker_ckpt_0.4/009905.wav"
                         type="audio/wav">
@@ -271,7 +271,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>Life was like a box of chocolates, you never know what you're gonna get.</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                         <source
                             src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/001.wav"
                             type="audio/wav">
@@ -279,7 +279,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
                 </audio>
             </td>
             <td> 
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                         <source
                             src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_1.wav"
                             type="audio/wav">
@@ -290,7 +290,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>With great power there must come great responsibility.</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                         <source
                             src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/002.wav"
                             type="audio/wav">
@@ -298,7 +298,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
                 </audio>
             </td>
             <td> 
-            <audio controls="controls">
+            <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_2.wav"
                         type="audio/wav">
@@ -309,7 +309,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>To be or not to be, that’s a question.</td>
             <td>
-            <audio controls="controls">
+            <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/003.wav"
                         type="audio/wav">
@@ -318,7 +318,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
             </td>
 
             <td> 
-            <audio controls="controls">
+            <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_3.wav"
                         type="audio/wav">
@@ -330,7 +330,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>A man can be destroyed but not defeated.</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/004.wav"
                         type="audio/wav">
@@ -339,7 +339,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
             </td>
 
             <td> 
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_4.wav"
                         type="audio/wav">
@@ -350,7 +350,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>Do not, for one repulse, give up the purpose that you resolved to effort.</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/005.wav"
                         type="audio/wav">
@@ -359,7 +359,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
             </td>
 
             <td> 
-            <audio controls="controls">
+            <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_5.wav"
                         type="audio/wav">
@@ -370,7 +370,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>Death is just a part of life, something we're all destined to do.</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/006.wav"
                         type="audio/wav">
@@ -379,7 +379,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
             </td>
 
             <td> 
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_6.wav"
                         type="audio/wav">
@@ -390,7 +390,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>I think it's hard winning a war with words. </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/007.wav"
                         type="audio/wav">
@@ -399,7 +399,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
             </td>
 
             <td> 
-            <audio controls="controls">
+            <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_7.wav"
                         type="audio/wav">
@@ -410,7 +410,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>Don’t argue with the people of strong determination, because they may change the fact!</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/008.wav"
                         type="audio/wav">
@@ -419,7 +419,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
             </td>
 
             <td> 
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_8.wav"
                         type="audio/wav">
@@ -430,7 +430,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>Love you three thousand times.</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/transformer_tts_ljspeech_ckpt_0.4_waveflow_ljspeech_ckpt_0.3/009.wav"
                         type="audio/wav">
@@ -439,7 +439,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
             </td>
 
             <td> 
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/tacotron2_ljspeech_waveflow_samples_0.2/sentence_9.wav"
                         type="audio/wav">
@@ -465,7 +465,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>凯莫瑞安联合体的经济崩溃，迫在眉睫。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/001.wav"
                         type="audio/wav">
@@ -473,7 +473,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/001.wav"
                         type="audio/wav">
@@ -484,7 +484,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>对于所有想要离开那片废土，去寻找更美好生活的人来说。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/002.wav"
                         type="audio/wav">
@@ -492,7 +492,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/002.wav"
                         type="audio/wav">
@@ -503,7 +503,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>克哈，是你们所有人安全的港湾。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/003.wav"
                         type="audio/wav">
@@ -511,7 +511,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/003.wav"
                         type="audio/wav">
@@ -523,7 +523,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>为了保护尤摩扬人民不受异虫的残害，我所做的，比他们自己的领导委员会都多。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/004.wav"
                         type="audio/wav">
@@ -531,7 +531,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/004.wav"
                         type="audio/wav">
@@ -542,7 +542,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>无论他们如何诽谤我，我将继续为所有泰伦人的最大利益，而努力奋斗。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/005.wav"
                         type="audio/wav">
@@ -550,7 +550,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/005.wav"
                         type="audio/wav">
@@ -561,7 +561,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>身为你们的元首，我带领泰伦人实现了人类统治领地和经济的扩张。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/006.wav"
                         type="audio/wav">
@@ -569,7 +569,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/006.wav"
                         type="audio/wav">
@@ -580,7 +580,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>我们将继续成长，用行动回击那些只会说风凉话，不愿意和我们相向而行的害群之马。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/007.wav"
                         type="audio/wav">
@@ -588,7 +588,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/007.wav"
                         type="audio/wav">
@@ -599,7 +599,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>帝国武装力量，无数的优秀儿女，正时刻守卫着我们的家园大门，但是他们孤木难支。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/008.wav"
                         type="audio/wav">
@@ -607,7 +607,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/008.wav"
                         type="audio/wav">
@@ -618,7 +618,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         <tr>
             <td>凡是今天应征入伍者，所获的所有刑罚罪责，减半。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speedyspeech_baker_ckpt_0.4_pwg_baker_ckpt_0.4/009.wav"
                         type="audio/wav">
@@ -626,7 +626,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_nosil_baker_ckpt_0.4_parallel_wavegan_baker_ckpt_0.4/009.wav"
                         type="audio/wav">
@@ -641,11 +641,11 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
 
     <table border="2" cellspacing="1" cellpadding="1"> 
         <tr>
-            <th align="center"> FastSpeech2-Conformer + ParallelWaveGAN </th>
+            <th align="center"> FastSpeech2-Conformer + <br>ParallelWaveGAN </th>
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/001.wav"
                         type="audio/wav">
@@ -655,7 +655,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/002.wav"
                         type="audio/wav">
@@ -665,7 +665,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/003.wav"
                         type="audio/wav">
@@ -676,7 +676,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
 
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/004.wav"
                         type="audio/wav">
@@ -686,7 +686,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/005.wav"
                         type="audio/wav">
@@ -696,7 +696,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/006.wav"
                         type="audio/wav">
@@ -706,7 +706,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/007.wav"
                         type="audio/wav">
@@ -716,7 +716,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/008.wav"
                         type="audio/wav">
@@ -726,7 +726,7 @@ Audio samples generated by a TTS system. Text is first transformed into spectrog
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fastspeech2_conformer_baker_ckpt_0.5_pwg_baker_ckpt_0.4/009.wav"
                         type="audio/wav">
@@ -756,7 +756,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/0.wav"
                         type="audio/wav">
@@ -764,7 +764,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/0_002.wav"
                         type="audio/wav">
@@ -774,7 +774,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/1.wav"
                         type="audio/wav">
@@ -782,7 +782,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/1_002.wav"
                         type="audio/wav">
@@ -792,7 +792,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/2.wav"
                         type="audio/wav">
@@ -800,7 +800,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/2_002.wav"
                         type="audio/wav">
@@ -810,7 +810,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/3.wav"
                         type="audio/wav">
@@ -818,7 +818,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/3_002.wav"
                         type="audio/wav">
@@ -828,7 +828,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/4.wav"
                         type="audio/wav">
@@ -836,7 +836,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/4_002.wav"
                         type="audio/wav">
@@ -846,7 +846,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/5.wav"
                         type="audio/wav">
@@ -854,7 +854,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/5_002.wav"
                         type="audio/wav">
@@ -864,7 +864,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/6.wav"
                         type="audio/wav">
@@ -872,7 +872,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/6_002.wav"
                         type="audio/wav">
@@ -882,7 +882,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/7.wav"
                         type="audio/wav">
@@ -890,7 +890,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/7_002.wav"
                         type="audio/wav">
@@ -900,7 +900,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/8.wav"
                         type="audio/wav">
@@ -908,7 +908,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/8_002.wav"
                         type="audio/wav">
@@ -918,7 +918,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/9.wav"
                         type="audio/wav">
@@ -926,7 +926,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/9_002.wav"
                         type="audio/wav">
@@ -936,7 +936,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/10.wav"
                         type="audio/wav">
@@ -944,7 +944,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/10_002.wav"
                         type="audio/wav">
@@ -954,7 +954,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/11.wav"
                         type="audio/wav">
@@ -962,7 +962,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/11_002.wav"
                         type="audio/wav">
@@ -972,7 +972,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/12.wav"
                         type="audio/wav">
@@ -980,7 +980,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/12_002.wav"
                         type="audio/wav">
@@ -990,7 +990,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/13.wav"
                         type="audio/wav">
@@ -998,7 +998,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/13_002.wav"
                         type="audio/wav">
@@ -1008,7 +1008,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/14.wav"
                         type="audio/wav">
@@ -1016,7 +1016,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/14_002.wav"
                         type="audio/wav">
@@ -1026,7 +1026,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/15.wav"
                         type="audio/wav">
@@ -1034,7 +1034,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/15_002.wav"
                         type="audio/wav">
@@ -1044,7 +1044,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/16.wav"
                         type="audio/wav">
@@ -1052,7 +1052,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/16_002.wav"
                         type="audio/wav">
@@ -1062,7 +1062,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/17.wav"
                         type="audio/wav">
@@ -1070,7 +1070,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/17_002.wav"
                         type="audio/wav">
@@ -1080,7 +1080,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/18.wav"
                         type="audio/wav">
@@ -1088,7 +1088,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/18_002.wav"
                         type="audio/wav">
@@ -1098,7 +1098,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/target/19.wav"
                         type="audio/wav">
@@ -1106,7 +1106,7 @@ PaddleSpeech also support Multi-Speaker TTS, we provide the audio demos generate
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/fs2_aishell3_demos/generated/19_002.wav"
                         type="audio/wav">
@@ -1142,7 +1142,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
         </tr>
         <tr>
              <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_001.wav"
                         type="audio/wav">
@@ -1150,7 +1150,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_001.wav"
                         type="audio/wav">
@@ -1158,7 +1158,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_001.wav"
                         type="audio/wav">
@@ -1168,7 +1168,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
         </tr>
         <tr>
              <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_002.wav"
                         type="audio/wav">
@@ -1176,7 +1176,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_002.wav"
                         type="audio/wav">
@@ -1184,7 +1184,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_002.wav"
                         type="audio/wav">
@@ -1194,7 +1194,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
         </tr>
         <tr>
              <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_003.wav"
                         type="audio/wav">
@@ -1202,7 +1202,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_003.wav"
                         type="audio/wav">
@@ -1210,7 +1210,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_003.wav"
                         type="audio/wav">
@@ -1220,7 +1220,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
         </tr>
         <tr>
              <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_004.wav"
                         type="audio/wav">
@@ -1228,7 +1228,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_004.wav"
                         type="audio/wav">
@@ -1236,7 +1236,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_004.wav"
                         type="audio/wav">
@@ -1246,7 +1246,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
         </tr>
         <tr>
              <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_005.wav"
                         type="audio/wav">
@@ -1254,7 +1254,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_005.wav"
                         type="audio/wav">
@@ -1262,7 +1262,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_005.wav"
                         type="audio/wav">
@@ -1272,7 +1272,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
         </tr>
         <tr>
              <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_007.wav"
                         type="audio/wav">
@@ -1280,7 +1280,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_007.wav"
                         type="audio/wav">
@@ -1288,7 +1288,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_007.wav"
                         type="audio/wav">
@@ -1298,7 +1298,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
         </tr>
         <tr>
              <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_008.wav"
                         type="audio/wav">
@@ -1306,7 +1306,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_008.wav"
                         type="audio/wav">
@@ -1314,7 +1314,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_008.wav"
                         type="audio/wav">
@@ -1324,7 +1324,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
         </tr>
         <tr>
              <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x0.8_009.wav"
                         type="audio/wav">
@@ -1332,7 +1332,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1_009.wav"
                         type="audio/wav">
@@ -1340,7 +1340,7 @@ The duration control in FastSpeech2 can control the speed of audios will keep th
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 250px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/speed/x1.2_009.wav"
                         type="audio/wav">
@@ -1374,7 +1374,7 @@ The nomal audios are in the second column of the previous table.
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/001.wav"
                         type="audio/wav">
@@ -1382,7 +1382,7 @@ The nomal audios are in the second column of the previous table.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice/001.wav"
                         type="audio/wav">
@@ -1392,7 +1392,7 @@ The nomal audios are in the second column of the previous table.
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/002.wav"
                         type="audio/wav">
@@ -1400,7 +1400,7 @@ The nomal audios are in the second column of the previous table.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice/002.wav"
                         type="audio/wav">
@@ -1410,7 +1410,7 @@ The nomal audios are in the second column of the previous table.
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/003.wav"
                         type="audio/wav">
@@ -1418,7 +1418,7 @@ The nomal audios are in the second column of the previous table.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice/003.wav"
                         type="audio/wav">
@@ -1428,7 +1428,7 @@ The nomal audios are in the second column of the previous table.
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/004.wav"
                         type="audio/wav">
@@ -1436,7 +1436,7 @@ The nomal audios are in the second column of the previous table.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice//004.wav"
                         type="audio/wav">
@@ -1446,7 +1446,7 @@ The nomal audios are in the second column of the previous table.
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/005.wav"
                         type="audio/wav">
@@ -1454,7 +1454,7 @@ The nomal audios are in the second column of the previous table.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice//005.wav"
                         type="audio/wav">
@@ -1464,7 +1464,7 @@ The nomal audios are in the second column of the previous table.
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/007.wav"
                         type="audio/wav">
@@ -1472,7 +1472,7 @@ The nomal audios are in the second column of the previous table.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice//007.wav"
                         type="audio/wav">
@@ -1482,7 +1482,7 @@ The nomal audios are in the second column of the previous table.
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/008.wav"
                         type="audio/wav">
@@ -1490,7 +1490,7 @@ The nomal audios are in the second column of the previous table.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice//008.wav"
                         type="audio/wav">
@@ -1500,7 +1500,7 @@ The nomal audios are in the second column of the previous table.
         </tr>
         <tr>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/robot/009.wav"
                         type="audio/wav">
@@ -1508,7 +1508,7 @@ The nomal audios are in the second column of the previous table.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/child_voice//009.wav"
                         type="audio/wav">
@@ -1542,7 +1542,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
         <tr>
             <td>他只是一个纸老虎。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/001.wav"
                         type="audio/wav">
@@ -1550,7 +1550,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/001.wav"
                         type="audio/wav">
@@ -1561,7 +1561,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
         <tr>
             <td>手表厂有五种好产品。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/002.wav"
                         type="audio/wav">
@@ -1569,7 +1569,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/002.wav"
                         type="audio/wav">
@@ -1580,7 +1580,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
         <tr>
             <td>老板的轿车需要保养。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/003.wav"
                         type="audio/wav">
@@ -1588,7 +1588,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/003.wav"
                         type="audio/wav">
@@ -1599,7 +1599,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
         <tr>
             <td>我们所有人都好喜欢你呀。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/004.wav"
                         type="audio/wav">
@@ -1607,7 +1607,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/004.wav"
                         type="audio/wav">
@@ -1618,7 +1618,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
         <tr>
             <td>岂有此理。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/005.wav"
                         type="audio/wav">
@@ -1626,7 +1626,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/005.wav"
                         type="audio/wav">
@@ -1637,7 +1637,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
         <tr>
             <td>虎骨酒多少钱一瓶。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/006.wav"
                         type="audio/wav">
@@ -1645,7 +1645,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/006.wav"
                         type="audio/wav">
@@ -1656,7 +1656,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
         <tr>
             <td>这件事情需要冷处理。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/007.wav"
                         type="audio/wav">
@@ -1664,7 +1664,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/007.wav"
                         type="audio/wav">
@@ -1675,7 +1675,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
         <tr>
             <td>这个老奶奶是个大喇叭。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/008.wav"
                         type="audio/wav">
@@ -1683,7 +1683,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/008.wav"
                         type="audio/wav">
@@ -1694,7 +1694,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
         <tr>
             <td>我喜欢说相声。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/009.wav"
                         type="audio/wav">
@@ -1702,7 +1702,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/009.wav"
                         type="audio/wav">
@@ -1713,7 +1713,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
         <tr>
             <td>有一天，我路过了一栋楼。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/with_frontend/010.wav"
                         type="audio/wav">
@@ -1721,7 +1721,7 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/without_frontend/010.wav"
                         type="audio/wav">
@@ -1735,4 +1735,142 @@ We use ``FastSpeech2`` + ``ParallelWaveGAN`` here.
     <br>
     <br> 
 
-   
\ No newline at end of file
+
+Finetune FastSpeech2 for CSMSC
+--------------------------------------
+
+Finetuning demos of `tts_finetune/tts3 <https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/tts_finetune/tts3>`_ for the CSMSC dataset.
+
+When finetuning for CSMSC, we judged the audio quality to rank as ``Freeze encoder`` > ``Non Frozen`` > ``Freeze encoder && duration_predictor``.
+
+.. raw:: html
+
+    <div class="table">
+    CSMSC reference audio (fastspeech2_csmsc + hifigan_aishell3 in CLI): 欢迎使用飞桨语音套件。
+    <br>
+    <br>
+    <audio controls="controls" style="width: 220px;">
+        <source
+            src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/finetune/ref_fastspeech2_csmsc_hifigan_aishell3.wav"
+            type="audio/wav">
+        Your browser does not support the <code>audio</code> element.
+    </audio>
+    <br>
+    <br>
+    <table border="2" cellspacing="1" cellpadding="1">
+        <tr>
+            <th align="center"> Frozen Method</th>
+            <th align="center"> train_num=10, <br> bs=10, <br> epoch=100, <br> lr=1e-4 </th>
+            <th align="center"> train_num=18, <br> bs=18, <br> epoch=100, <br> lr=1e-4 </th>
+            <th align="center"> train_num=97, <br> bs=64, <br> epoch=100, <br> lr=1e-4 </th>
+            <th align="center"> train_num=196, <br> bs=64, <br> epoch=100, <br> lr=1e-4 </th>
+        </tr>
+        <tr>
+            <td>Non Frozen</td>
+            <td>
+                <audio controls="controls" style="width: 150px;">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/finetune/train10_bn10_epoch100_lr0.0001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls" style="width: 150px;">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/finetune/train18_bn18_epoch100_lr0.0001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls" style="width: 150px;">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/finetune/train97_bn64_epoch100_lr0.0001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls" style="width: 150px;">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/finetune/train196_bn64_epoch100_lr0.0001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+            <td>Freeze encoder</td>
+            <td>
+                <audio controls="controls" style="width: 150px;">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/finetune/train10_fr_encoder_bn10_epoch100_lr0.0001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls" style="width: 150px;">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/finetune/train18_fr_encoder_bn18_epoch100_lr0.0001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls" style="width: 150px;">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/finetune/train97_fr_encoder_bn64_epoch100_lr0.0001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls" style="width: 150px;">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/finetune/train196_fr_encoder_bn64_epoch100_lr0.0001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+        <tr>
+            <td>Freeze encoder &amp;&amp; <br> duration_predictor</td>
+            <td>
+                <audio controls="controls" style="width: 150px;">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/finetune/train10_fr_encoder_duration_bn10_epoch100_lr0.0001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls" style="width: 150px;">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/finetune/train18_fr_encoder_duration_bn18_epoch100_lr0.0001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls" style="width: 150px;">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/finetune/train97_fr_encoder_duration_bn64_epoch100_lr0.0001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+            <td>
+                <audio controls="controls" style="width: 150px;">
+                    <source
+                        src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/finetune/train196_fr_encoder_duration_bn64_epoch100_lr0.0001.wav"
+                        type="audio/wav">
+                    Your browser does not support the <code>audio</code> element.
+                </audio>
+            </td>
+        </tr>
+    </table>
+    </div>
+    <br>
+    <br> 
diff --git a/docs/source/tts/demo_2.rst b/docs/source/tts/demo_2.rst
index 2f0ca7cd..06d0d039 100644
--- a/docs/source/tts/demo_2.rst
+++ b/docs/source/tts/demo_2.rst
@@ -19,7 +19,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
         <tr>
             <td>早上好，今天是2020/10/29，最低温度是-3°C。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/001.wav"
                         type="audio/wav">
@@ -27,7 +27,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/001.wav"
                         type="audio/wav">
@@ -38,7 +38,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
         <tr>
             <td>你好，我的编号是37249，很高兴为您服务。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/002.wav"
                         type="audio/wav">
@@ -46,7 +46,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/002.wav"
                         type="audio/wav">
@@ -57,7 +57,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
         <tr>
             <td>我们公司有37249个人。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/003.wav"
                         type="audio/wav">
@@ -65,7 +65,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/003.wav"
                         type="audio/wav">
@@ -76,7 +76,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
         <tr>
             <td>我出生于2005年10月8日。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/004.wav"
                         type="audio/wav">
@@ -84,7 +84,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/004.wav"
                         type="audio/wav">
@@ -95,7 +95,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
         <tr>
             <td>我们习惯在12:30吃中午饭。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/005.wav"
                         type="audio/wav">
@@ -103,7 +103,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/005.wav"
                         type="audio/wav">
@@ -114,7 +114,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
         <tr>
             <td>只要有超过3/4的人投票同意，你就会成为我们的新班长。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/006.wav"
                         type="audio/wav">
@@ -122,7 +122,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/006.wav"
                         type="audio/wav">
@@ -133,7 +133,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
         <tr>
             <td>我要买一只价值999.9元的手表。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/007.wav"
                         type="audio/wav">
@@ -141,7 +141,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/007.wav"
                         type="audio/wav">
@@ -152,7 +152,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
         <tr>
             <td>我的手机号是18544139121，欢迎来电。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/008.wav"
                         type="audio/wav">
@@ -160,7 +160,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/008.wav"
                         type="audio/wav">
@@ -171,7 +171,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
         <tr>
             <td>明天有62%的概率降雨。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/009.wav"
                         type="audio/wav">
@@ -179,7 +179,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/009.wav"
                         type="audio/wav">
@@ -190,7 +190,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
         <tr>
             <td>手表厂有五种好产品。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/010.wav"
                         type="audio/wav">
@@ -198,7 +198,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/010.wav"
                         type="audio/wav">
@@ -209,7 +209,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
         <tr>
             <td>跑马场有五百匹很勇敢的千里马。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/011.wav"
                         type="audio/wav">
@@ -217,7 +217,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/011.wav"
                         type="audio/wav">
@@ -228,7 +228,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
         <tr>
             <td>有一天，我看到了一栋楼，我顿感不妙，因为我看不清里面有没有人。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/012.wav"
                         type="audio/wav">
@@ -236,7 +236,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/012.wav"
                         type="audio/wav">
@@ -247,7 +247,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
         <tr>
             <td>史小姐拿着小雨伞去找她的老保姆了。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/013.wav"
                         type="audio/wav">
@@ -255,7 +255,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/013.wav"
                         type="audio/wav">
@@ -266,7 +266,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
         <tr>
             <td>不要相信这个老奶奶说的话，她一点儿也不好。</td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/espent/014.wav"
                         type="audio/wav">
@@ -274,7 +274,7 @@ FastSpeech2 + Parallel WaveGAN in CSMSC
                 </audio>
             </td>
             <td>
-                <audio controls="controls">
+                <audio controls="controls" style="width: 220px;">
                     <source
                         src="https://paddlespeech.bj.bcebos.com/Parakeet/docs/demos/parakeet_espnet_fs2_pwg_demo/tn_g2p/parakeet/014.wav"
                         type="audio/wav">
diff --git a/examples/aishell3/ernie_sat/README.md b/examples/aishell3/ernie_sat/README.md
index eb867ab7..9b776898 100644
--- a/examples/aishell3/ernie_sat/README.md
+++ b/examples/aishell3/ernie_sat/README.md
@@ -1,4 +1,4 @@
-# ERNIE-SAT with VCTK dataset
+# ERNIE-SAT with AISHELL-3 dataset
 ERNIE-SAT speech-text joint pretraining framework, which achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks, It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
 
 ## Model Framework
diff --git a/examples/aishell3/vc0/local/synthesize.sh b/examples/aishell3/vc0/local/synthesize.sh
index 98430280..04dc23ea 100755
--- a/examples/aishell3/vc0/local/synthesize.sh
+++ b/examples/aishell3/vc0/local/synthesize.sh
@@ -4,8 +4,6 @@ config_path=$1
 train_output_path=$2
 ckpt_name=$3
 
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../synthesize.py \
     --am=tacotron2_aishell3 \
     --am_config=${config_path} \
diff --git a/examples/aishell3/vc0/local/voice_cloning.sh b/examples/aishell3/vc0/local/voice_cloning.sh
index 79831f3f..20cba281 100755
--- a/examples/aishell3/vc0/local/voice_cloning.sh
+++ b/examples/aishell3/vc0/local/voice_cloning.sh
@@ -6,8 +6,6 @@ ckpt_name=$3
 ge2e_params_path=$4
 ref_audio_dir=$5
 
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../voice_cloning.py \
     --am=tacotron2_aishell3 \
     --am_config=${config_path} \
diff --git a/examples/aishell3/vc1/local/synthesize.sh b/examples/aishell3/vc1/local/synthesize.sh
index 8c61e3f3..8fd8977d 100755
--- a/examples/aishell3/vc1/local/synthesize.sh
+++ b/examples/aishell3/vc1/local/synthesize.sh
@@ -4,8 +4,6 @@ config_path=$1
 train_output_path=$2
 ckpt_name=$3
 
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../synthesize.py \
     --am=fastspeech2_aishell3 \
     --am_config=${config_path} \
diff --git a/examples/aishell3/vc1/local/voice_cloning.sh b/examples/aishell3/vc1/local/voice_cloning.sh
index 2a8864ba..71c11956 100755
--- a/examples/aishell3/vc1/local/voice_cloning.sh
+++ b/examples/aishell3/vc1/local/voice_cloning.sh
@@ -6,8 +6,6 @@ ckpt_name=$3
 ge2e_params_path=$4
 ref_audio_dir=$5
 
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../voice_cloning.py \
     --am=fastspeech2_aishell3 \
     --am_config=${config_path} \
diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh
index 8c61e3f3..8fd8977d 100755
--- a/examples/aishell3/vc2/local/synthesize.sh
+++ b/examples/aishell3/vc2/local/synthesize.sh
@@ -4,8 +4,6 @@ config_path=$1
 train_output_path=$2
 ckpt_name=$3
 
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../synthesize.py \
     --am=fastspeech2_aishell3 \
     --am_config=${config_path} \
diff --git a/examples/aishell3/vc2/local/voice_cloning.sh b/examples/aishell3/vc2/local/voice_cloning.sh
index 09c5e436..ae8211b9 100755
--- a/examples/aishell3/vc2/local/voice_cloning.sh
+++ b/examples/aishell3/vc2/local/voice_cloning.sh
@@ -5,8 +5,6 @@ train_output_path=$2
 ckpt_name=$3
 ref_audio_dir=$4
 
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
 python3 ${BIN_DIR}/../voice_cloning.py \
     --am=fastspeech2_aishell3 \
     --am_config=${config_path} \
diff --git a/examples/aishell3_vctk/ernie_sat/README.md b/examples/aishell3_vctk/ernie_sat/README.md
index d55af675..32195783 100644
--- a/examples/aishell3_vctk/ernie_sat/README.md
+++ b/examples/aishell3_vctk/ernie_sat/README.md
@@ -1,4 +1,4 @@
-# ERNIE-SAT with VCTK dataset
+# ERNIE-SAT with AISHELL-3 and VCTK datasets
 ERNIE-SAT speech-text joint pretraining framework, which achieves SOTA results in cross-lingual multi-speaker speech synthesis and cross-lingual speech editing tasks, It can be applied to a series of scenarios such as Speech Editing, personalized Speech Synthesis, and Voice Cloning.
 
 ## Model Framework
diff --git a/examples/other/tts_finetune/tts3/README.md b/examples/other/tts_finetune/tts3/README.md
index 192ee7ff..ceb8e797 100644
--- a/examples/other/tts_finetune/tts3/README.md
+++ b/examples/other/tts_finetune/tts3/README.md
@@ -1,20 +1,41 @@
-# Finetune your own AM based on FastSpeech2 with AISHELL-3.
-This example shows how to finetune your own AM based on FastSpeech2 with AISHELL-3. We use part of csmsc's data (top 200) as finetune data in this example. The example is implemented according to this [discussion](https://github.com/PaddlePaddle/PaddleSpeech/discussions/1842). Thanks to the developer for the idea.
+# Finetune your own AM based on FastSpeech2 with a multi-speaker dataset.
+This example shows how to finetune your own AM based on FastSpeech2 with a multi-speaker dataset. For finetuning Chinese data, we use part of csmsc's data (top 200) and Fastspeech2 pretrained model with AISHELL-3. For finetuning English data, we use part of ljspeech's data (top 200) and Fastspeech2 pretrained model with VCTK. The example is implemented according to this [discussion](https://github.com/PaddlePaddle/PaddleSpeech/discussions/1842). Thanks to the developer for the idea.
 
-We use AISHELL-3 to train a multi-speaker fastspeech2 model. You can refer [examples/aishell3/tts3](https://github.com/lym0302/PaddleSpeech/tree/develop/examples/aishell3/tts3) to train multi-speaker fastspeech2 from scratch.
+For more information on training Fastspeech2 with AISHELL-3, you can refer to [examples/aishell3/tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/tts3). For more information on training Fastspeech2 with VCTK, you can refer to [examples/vctk/tts3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/tts3).
 
-## Prepare
-### Download Pretrained Fastspeech2 model
-Assume the path to the model is `./pretrained_models`. Download pretrained fastspeech2 model with aishell3: [fastspeech2_aishell3_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_1.1.0.zip). 
+
+## Prepare 
+### Download Pretrained model
+Assume the path to the model is `./pretrained_models`. </br>
+If you want to finetune Chinese data, you need to download Fastspeech2 pretrained model with AISHELL-3: [fastspeech2_aishell3_ckpt_1.1.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_1.1.0.zip) for finetuning. Download HiFiGAN pretrained model with aishell3: [hifigan_aishell3_ckpt_0.2.0](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip) for synthesis.
 
 ```bash
 mkdir -p pretrained_models && cd pretrained_models
+# pretrained fastspeech2 model
 wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_aishell3_ckpt_1.1.0.zip 
 unzip fastspeech2_aishell3_ckpt_1.1.0.zip
+# pretrained hifigan model
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip
+unzip hifigan_aishell3_ckpt_0.2.0.zip
 cd ../
 ```
+
+
+If you want to finetune English data, you need to download Fastspeech2 pretrained model with VCTK: [fastspeech2_vctk_ckpt_1.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_ckpt_1.2.0.zip) for finetuning. Download HiFiGAN pretrained model with VCTK: [hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip) for synthesis.
+
+```bash
+mkdir -p pretrained_models && cd pretrained_models
+# pretrained fastspeech2 model
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_vctk_ckpt_1.2.0.zip 
+unzip fastspeech2_vctk_ckpt_1.2.0.zip
+# pretrained hifigan model
+wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip
+unzip hifigan_vctk_ckpt_0.2.0.zip
+cd ../
+```
+
 ### Download MFA tools and pretrained model
-Assume the path to the MFA tool is `./tools`. Download [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz) and pretrained MFA models with aishell3: [aishell3_model.zip](https://paddlespeech.bj.bcebos.com/MFA/ernie_sat/aishell3_model.zip).
+Assume the path to the MFA tool is `./tools`. Download [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz).
 
 ```bash
 mkdir -p tools && cd tools
@@ -22,16 +43,34 @@ mkdir -p tools && cd tools
 wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz
 tar xvf montreal-forced-aligner_linux.tar.gz
 cp montreal-forced-aligner/lib/libpython3.6m.so.1.0 montreal-forced-aligner/lib/libpython3.6m.so
-# pretrained mfa model
 mkdir -p aligner && cd aligner
+```
+
+If you want to finetune Chinese data, you need to download pretrained MFA models with aishell3: [aishell3_model.zip](https://paddlespeech.bj.bcebos.com/MFA/ernie_sat/aishell3_model.zip) and unzip it.
+
+```bash
+# pretrained mfa model for Chinese data
 wget https://paddlespeech.bj.bcebos.com/MFA/ernie_sat/aishell3_model.zip
 unzip aishell3_model.zip
 wget https://paddlespeech.bj.bcebos.com/MFA/AISHELL-3/with_tone/simple.lexicon
 cd ../../
 ```
 
+If you want to finetune English data, you need to download pretrained MFA models with vctk: [vctk_model.zip](https://paddlespeech.bj.bcebos.com/MFA/ernie_sat/vctk_model.zip) and unzip it.
+
+```bash
+# pretrained mfa model for English data
+wget https://paddlespeech.bj.bcebos.com/MFA/ernie_sat/vctk_model.zip
+unzip vctk_model.zip
+wget https://paddlespeech.bj.bcebos.com/MFA/LJSpeech-1.1/cmudict-0.7b
+cd ../../
+```
+
 ### Prepare your data
-Assume the path to the dataset is `./input`. This directory contains audio files (*.wav) and label file (labels.txt). The audio file is in wav format. The format of the label file is: utt_id|pinyin. Here is an example of the first 200 data of csmsc.
+Assume the path to the dataset is `./input` which contains a speaker folder. Speaker folder contains audio files (*.wav) and label file (labels.txt). The format of the audio file is wav. The format of the label file is: utt_id|pronunciation. </br>
+
+If you want to finetune Chinese data, Chinese label example: 000001|ka2 er2 pu3 pei2 wai4 sun1 wan2 hua2 ti1</br>
+Here is an example of the first 200 data of csmsc.
 
 ```bash
 mkdir -p input && cd input
@@ -60,7 +99,12 @@ When "Prepare" done. The structure of the current directory is listed below.
 │   │   ├── snapshot_iter_96400.pdz
 │   │   ├── speaker_id_map.txt
 │   │   └── speech_stats.npy
-│   └── fastspeech2_aishell3_ckpt_1.1.0.zip
+│   ├── fastspeech2_aishell3_ckpt_1.1.0.zip
+│   ├── hifigan_aishell3_ckpt_0.2.0    
+│   │   ├── default.yaml
+│   │   ├── feats_stats.npy
+│   │   └── snapshot_iter_2500000.pdz
+│   └── hifigan_aishell3_ckpt_0.2.0.zip
 └── tools
     ├── aligner
     │   ├── aishell3_model
@@ -75,20 +119,71 @@ When "Prepare" done. The structure of the current directory is listed below.
 
 ```
 
+If you want to finetune English data, English label example: LJ001-0001|Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition </br>
+Here is an example of the first 200 data of ljspeech.
+
+```bash
+mkdir -p input && cd input
+wget https://paddlespeech.bj.bcebos.com/datasets/ljspeech_mini.zip
+unzip ljspeech_mini.zip
+cd ../
+```
+
+When "Prepare" done. The structure of the current directory is listed below.
+```text
+├── input
+│   ├── ljspeech_mini
+│   │   ├── LJ001-0001.wav
+│   │   ├── LJ001-0002.wav
+│   │   ├── LJ001-0003.wav
+│   │   ├── ...
+│   │   ├── LJ002-0014.wav
+│   │   ├── labels.txt
+│   └── ljspeech_mini.zip
+├── pretrained_models
+│   ├── fastspeech2_vctk_ckpt_1.2.0
+│   │   ├── default.yaml
+│   │   ├── energy_stats.npy
+│   │   ├── phone_id_map.txt
+│   │   ├── pitch_stats.npy
+│   │   ├── snapshot_iter_66200.pdz
+│   │   ├── speaker_id_map.txt
+│   │   └── speech_stats.npy
+│   ├── fastspeech2_vctk_ckpt_1.2.0.zip
+│   ├── hifigan_vctk_ckpt_0.2.0    
+│   │   ├── default.yaml
+│   │   ├── feats_stats.npy
+│   │   └── snapshot_iter_2500000.pdz
+│   └── hifigan_vctk_ckpt_0.2.0.zip
+└── tools
+    ├── aligner
+    │   ├── vctk_model
+    │   ├── vctk_model.zip
+    │   └── cmudict-0.7b
+    ├── montreal-forced-aligner
+    │   ├── bin
+    │   ├── lib
+    │   └── pretrained_models
+    └── montreal-forced-aligner_linux.tar.gz
+    ...
+
+```
+
 ### Set finetune.yaml
-`finetune.yaml` contains some configurations for fine-tuning. You can try various options to fine better result.
+`conf/finetune.yaml` contains some configurations for fine-tuning. You can try various options to find better results. The value of `frozen_layers` can be changed according to `conf/fastspeech2_layers.txt`, which lists the model layers of fastspeech2.
+
 Arguments:
-  - `batch_size`: finetune batch size. Default: -1, means 64 which same to pretrained model
+  - `batch_size`: finetune batch size which should be less than or equal to the number of training samples. Default: -1, means 64 which same to pretrained model
   - `learning_rate`: learning rate. Default: 0.0001
   - `num_snapshots`: number of save models. Default: -1, means 5 which same to pretrained model
   - `frozen_layers`: frozen layers. must be a list. If you don't want to frozen any layer, set []. 
 
 
-
 ## Get Started
+For Chinese data finetune, execute `./run.sh`. For English data finetune, execute `./run_en.sh`. </br>
 Run the command below to
 1. **source path**.
-2. finetune the model.
+2. finetune the model. 
 3. synthesize wavs.
     - synthesize waveform from text file.
 
@@ -102,76 +197,59 @@ You can choose a range of stages you want to run, or set `stage` equal to `stop-
 Finetune a FastSpeech2 model. 
 
 ```bash
-./run.sh --stage 0 --stop-stage 0
+./run.sh --stage 0 --stop-stage 5
 ```
-`stage 0` of `run.sh` calls `finetune.py`, here's the complete help message.
+`stage 5` of `run.sh` calls `local/finetune.py`, here's the complete help message.
 
 ```text
-usage: finetune.py [-h] [--input_dir INPUT_DIR] [--pretrained_model_dir PRETRAINED_MODEL_DIR]
-                [--mfa_dir MFA_DIR] [--dump_dir DUMP_DIR]
-                [--output_dir OUTPUT_DIR] [--lang LANG]
-                [--ngpu NGPU]
+usage: finetune.py [-h] [--pretrained_model_dir PRETRAINED_MODEL_DIR]
+                [--dump_dir DUMP_DIR] [--output_dir OUTPUT_DIR] [--ngpu NGPU]
+                [--epoch EPOCH] [--finetune_config FINETUNE_CONFIG]
 
 optional arguments:
-  -h, --help            show this help message and exit
-  --input_dir INPUT_DIR       
-                        directory containing audio and label file
+  -h, --help           Show this help message and exit
   --pretrained_model_dir PRETRAINED_MODEL_DIR
                        Path to pretrained model
-  --mfa_dir MFA_DIR    directory to save aligned files
   --dump_dir DUMP_DIR
                        directory to save feature files and metadata
   --output_dir OUTPUT_DIR      
-                       directory to save finetune model 
-  --lang LANG          Choose input audio language, zh or en
-  --ngpu NGPU          if ngpu=0, use cpu
-  --epoch EPOCH        the epoch of finetune
-  --batch_size BATCH_SIZE        
-                       the batch size of finetune, default -1 means same as pretrained model
-
+                       Directory to save finetune model 
+  --ngpu NGPU          The number of gpu, if ngpu=0, use cpu
+  --epoch EPOCH        The epoch of finetune
+  --finetune_config FINETUNE_CONFIG        
+                       Path to finetune config file
 ```
-1. `--input_dir` is the directory containing audio and label file. 
-2. `--pretrained_model_dir` is the directory incluing pretrained fastspeech2_aishell3 model.
-3. `--mfa_dir` is the directory to save the results of aligning from pretrained MFA_aishell3 model.
-4. `--dump_dir` is the directory including audio feature and metadata.
-5. `--output_dir` is the directory to save finetune model.
-6. `--lang` is the language of input audio, zh or en.
-7. `--ngpu` is the number of gpu.
-8. `--epoch` is the epoch of finetune.
-9. `--batch_size` is the batch size of finetune.
+
+1. `--pretrained_model_dir` is the directory including pretrained fastspeech2_aishell3 model.
+2. `--dump_dir` is the directory including audio feature and metadata.
+3. `--output_dir` is the directory to save finetune model.
+4. `--ngpu` is the number of gpu, if ngpu=0, use cpu
+5. `--epoch` is the epoch of finetune.
+6. `--finetune_config` is the path to finetune config file
+ 
 
 ### Synthesizing
-We use [HiFiGAN](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc5) as the neural vocoder.
+To synthesize Chinese audio, we use [HiFiGAN with aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc5) as the neural vocoder.
 Assume the path to the hifigan model is `./pretrained_models`. Download the pretrained HiFiGAN model from [hifigan_aishell3_ckpt_0.2.0](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip) and unzip it.
 
-```bash
-cd pretrained_models
-wget https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip
-unzip hifigan_aishell3_ckpt_0.2.0.zip
-cd ../
-```
+To synthesize English audio, we use [HiFiGAN with vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc5) as the neural vocoder.
+Assume the path to the hifigan model is `./pretrained_models`. Download the pretrained HiFiGAN model from [hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip) and unzip it.
+
 
-HiFiGAN checkpoint contains files listed below.
-```text
-hifigan_aishell3_ckpt_0.2.0
-├── default.yaml                   # default config used to train HiFiGAN
-├── feats_stats.npy                # statistics used to normalize spectrogram when training HiFiGAN
-└── snapshot_iter_2500000.pdz      # generator parameters of HiFiGAN
-```
 Modify `ckpt` in `run.sh` to the final model in `exp/default/checkpoints`.
 ```bash
-./run.sh --stage 1 --stop-stage 1
+./run.sh --stage 6 --stop-stage 6
 ```
-`stage 1` of `run.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file.
+`stage 6` of `run.sh` calls `${BIN_DIR}/../synthesize_e2e.py`, which can synthesize waveform from text file.
 
 ```text
 usage: synthesize_e2e.py [-h]
-                         [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}]
+                         [--am {fastspeech2_aishell3,fastspeech2_vctk}]
                          [--am_config AM_CONFIG] [--am_ckpt AM_CKPT]
                          [--am_stat AM_STAT] [--phones_dict PHONES_DICT]
                          [--tones_dict TONES_DICT]
                          [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID]
-                         [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}]
+                         [--voc {pwgan_aishell3, pwgan_vctk, hifigan_aishell3, hifigan_vctk}]
                          [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT]
                          [--voc_stat VOC_STAT] [--lang LANG]
                          [--inference_dir INFERENCE_DIR] [--ngpu NGPU]
@@ -181,7 +259,7 @@ Synthesize with acoustic model & vocoder
 
 optional arguments:
   -h, --help            show this help message and exit
-  --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}
+  --am {fastspeech2_aishell3, fastspeech2_vctk}
                         Choose acoustic model type of tts task.
   --am_config AM_CONFIG
                         Config of acoustic model.
@@ -195,7 +273,7 @@ optional arguments:
   --speaker_dict SPEAKER_DICT
                         speaker id map file.
   --spk_id SPK_ID       spk id for multi speaker acoustic model
-  --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}
+  --voc {pwgan_aishell3, pwgan_vctk, hifigan_aishell3, hifigan_vctk}
                         Choose vocoder type of tts task.
   --voc_config VOC_CONFIG
                         Config of voc.
@@ -210,6 +288,7 @@ optional arguments:
   --output_dir OUTPUT_DIR
                         output dir.
 ```
+
 1. `--am` is acoustic model type with the format {model_name}_{dataset}
 2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model.
 3. `--voc` is vocoder type with the format {model_name}_{dataset}
@@ -219,5 +298,8 @@ optional arguments:
 7.  `--output_dir` is the directory to save synthesized audio files.
 8. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
 
+
 ### Tips
-If you want to get better audio quality, you can use more audios to finetune.
+If you want to get better audio quality, you can use more audio samples to finetune or change the configuration parameters in `conf/finetune.yaml`.<br>
+More finetune results can be found on [finetune-fastspeech2-for-csmsc](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html#finetune-fastspeech2-for-csmsc).<br>
+The results show the effect on csmsc_mini: Freeze encoder > Non Frozen > Freeze encoder && duration_predictor.
diff --git a/examples/other/tts_finetune/tts3/conf/fastspeech2_layers.txt b/examples/other/tts_finetune/tts3/conf/fastspeech2_layers.txt
new file mode 100644
index 00000000..855f36b9
--- /dev/null
+++ b/examples/other/tts_finetune/tts3/conf/fastspeech2_layers.txt
@@ -0,0 +1,216 @@
+epoch
+iteration
+main_params
+main_optimizer
+spk_embedding_table.weight
+encoder.embed.0.weight
+encoder.embed.1.alpha
+encoder.encoders.0.self_attn.linear_q.weight
+encoder.encoders.0.self_attn.linear_q.bias
+encoder.encoders.0.self_attn.linear_k.weight
+encoder.encoders.0.self_attn.linear_k.bias
+encoder.encoders.0.self_attn.linear_v.weight
+encoder.encoders.0.self_attn.linear_v.bias
+encoder.encoders.0.self_attn.linear_out.weight
+encoder.encoders.0.self_attn.linear_out.bias
+encoder.encoders.0.feed_forward.w_1.weight
+encoder.encoders.0.feed_forward.w_1.bias
+encoder.encoders.0.feed_forward.w_2.weight
+encoder.encoders.0.feed_forward.w_2.bias
+encoder.encoders.0.norm1.weight
+encoder.encoders.0.norm1.bias
+encoder.encoders.0.norm2.weight
+encoder.encoders.0.norm2.bias
+encoder.encoders.1.self_attn.linear_q.weight
+encoder.encoders.1.self_attn.linear_q.bias
+encoder.encoders.1.self_attn.linear_k.weight
+encoder.encoders.1.self_attn.linear_k.bias
+encoder.encoders.1.self_attn.linear_v.weight
+encoder.encoders.1.self_attn.linear_v.bias
+encoder.encoders.1.self_attn.linear_out.weight
+encoder.encoders.1.self_attn.linear_out.bias
+encoder.encoders.1.feed_forward.w_1.weight
+encoder.encoders.1.feed_forward.w_1.bias
+encoder.encoders.1.feed_forward.w_2.weight
+encoder.encoders.1.feed_forward.w_2.bias
+encoder.encoders.1.norm1.weight
+encoder.encoders.1.norm1.bias
+encoder.encoders.1.norm2.weight
+encoder.encoders.1.norm2.bias
+encoder.encoders.2.self_attn.linear_q.weight
+encoder.encoders.2.self_attn.linear_q.bias
+encoder.encoders.2.self_attn.linear_k.weight
+encoder.encoders.2.self_attn.linear_k.bias
+encoder.encoders.2.self_attn.linear_v.weight
+encoder.encoders.2.self_attn.linear_v.bias
+encoder.encoders.2.self_attn.linear_out.weight
+encoder.encoders.2.self_attn.linear_out.bias
+encoder.encoders.2.feed_forward.w_1.weight
+encoder.encoders.2.feed_forward.w_1.bias
+encoder.encoders.2.feed_forward.w_2.weight
+encoder.encoders.2.feed_forward.w_2.bias
+encoder.encoders.2.norm1.weight
+encoder.encoders.2.norm1.bias
+encoder.encoders.2.norm2.weight
+encoder.encoders.2.norm2.bias
+encoder.encoders.3.self_attn.linear_q.weight
+encoder.encoders.3.self_attn.linear_q.bias
+encoder.encoders.3.self_attn.linear_k.weight
+encoder.encoders.3.self_attn.linear_k.bias
+encoder.encoders.3.self_attn.linear_v.weight
+encoder.encoders.3.self_attn.linear_v.bias
+encoder.encoders.3.self_attn.linear_out.weight
+encoder.encoders.3.self_attn.linear_out.bias
+encoder.encoders.3.feed_forward.w_1.weight
+encoder.encoders.3.feed_forward.w_1.bias
+encoder.encoders.3.feed_forward.w_2.weight
+encoder.encoders.3.feed_forward.w_2.bias
+encoder.encoders.3.norm1.weight
+encoder.encoders.3.norm1.bias
+encoder.encoders.3.norm2.weight
+encoder.encoders.3.norm2.bias
+encoder.after_norm.weight
+encoder.after_norm.bias
+spk_projection.weight
+spk_projection.bias
+duration_predictor.conv.0.0.weight
+duration_predictor.conv.0.0.bias
+duration_predictor.conv.0.2.weight
+duration_predictor.conv.0.2.bias
+duration_predictor.conv.1.0.weight
+duration_predictor.conv.1.0.bias
+duration_predictor.conv.1.2.weight
+duration_predictor.conv.1.2.bias
+duration_predictor.linear.weight
+duration_predictor.linear.bias
+pitch_predictor.conv.0.0.weight
+pitch_predictor.conv.0.0.bias
+pitch_predictor.conv.0.2.weight
+pitch_predictor.conv.0.2.bias
+pitch_predictor.conv.1.0.weight
+pitch_predictor.conv.1.0.bias
+pitch_predictor.conv.1.2.weight
+pitch_predictor.conv.1.2.bias
+pitch_predictor.conv.2.0.weight
+pitch_predictor.conv.2.0.bias
+pitch_predictor.conv.2.2.weight
+pitch_predictor.conv.2.2.bias
+pitch_predictor.conv.3.0.weight
+pitch_predictor.conv.3.0.bias
+pitch_predictor.conv.3.2.weight
+pitch_predictor.conv.3.2.bias
+pitch_predictor.conv.4.0.weight
+pitch_predictor.conv.4.0.bias
+pitch_predictor.conv.4.2.weight
+pitch_predictor.conv.4.2.bias
+pitch_predictor.linear.weight
+pitch_predictor.linear.bias
+pitch_embed.0.weight
+pitch_embed.0.bias
+energy_predictor.conv.0.0.weight
+energy_predictor.conv.0.0.bias
+energy_predictor.conv.0.2.weight
+energy_predictor.conv.0.2.bias
+energy_predictor.conv.1.0.weight
+energy_predictor.conv.1.0.bias
+energy_predictor.conv.1.2.weight
+energy_predictor.conv.1.2.bias
+energy_predictor.linear.weight
+energy_predictor.linear.bias
+energy_embed.0.weight
+energy_embed.0.bias
+decoder.embed.0.alpha
+decoder.encoders.0.self_attn.linear_q.weight
+decoder.encoders.0.self_attn.linear_q.bias
+decoder.encoders.0.self_attn.linear_k.weight
+decoder.encoders.0.self_attn.linear_k.bias
+decoder.encoders.0.self_attn.linear_v.weight
+decoder.encoders.0.self_attn.linear_v.bias
+decoder.encoders.0.self_attn.linear_out.weight
+decoder.encoders.0.self_attn.linear_out.bias
+decoder.encoders.0.feed_forward.w_1.weight
+decoder.encoders.0.feed_forward.w_1.bias
+decoder.encoders.0.feed_forward.w_2.weight
+decoder.encoders.0.feed_forward.w_2.bias
+decoder.encoders.0.norm1.weight
+decoder.encoders.0.norm1.bias
+decoder.encoders.0.norm2.weight
+decoder.encoders.0.norm2.bias
+decoder.encoders.1.self_attn.linear_q.weight
+decoder.encoders.1.self_attn.linear_q.bias
+decoder.encoders.1.self_attn.linear_k.weight
+decoder.encoders.1.self_attn.linear_k.bias
+decoder.encoders.1.self_attn.linear_v.weight
+decoder.encoders.1.self_attn.linear_v.bias
+decoder.encoders.1.self_attn.linear_out.weight
+decoder.encoders.1.self_attn.linear_out.bias
+decoder.encoders.1.feed_forward.w_1.weight
+decoder.encoders.1.feed_forward.w_1.bias
+decoder.encoders.1.feed_forward.w_2.weight
+decoder.encoders.1.feed_forward.w_2.bias
+decoder.encoders.1.norm1.weight
+decoder.encoders.1.norm1.bias
+decoder.encoders.1.norm2.weight
+decoder.encoders.1.norm2.bias
+decoder.encoders.2.self_attn.linear_q.weight
+decoder.encoders.2.self_attn.linear_q.bias
+decoder.encoders.2.self_attn.linear_k.weight
+decoder.encoders.2.self_attn.linear_k.bias
+decoder.encoders.2.self_attn.linear_v.weight
+decoder.encoders.2.self_attn.linear_v.bias
+decoder.encoders.2.self_attn.linear_out.weight
+decoder.encoders.2.self_attn.linear_out.bias
+decoder.encoders.2.feed_forward.w_1.weight
+decoder.encoders.2.feed_forward.w_1.bias
+decoder.encoders.2.feed_forward.w_2.weight
+decoder.encoders.2.feed_forward.w_2.bias
+decoder.encoders.2.norm1.weight
+decoder.encoders.2.norm1.bias
+decoder.encoders.2.norm2.weight
+decoder.encoders.2.norm2.bias
+decoder.encoders.3.self_attn.linear_q.weight
+decoder.encoders.3.self_attn.linear_q.bias
+decoder.encoders.3.self_attn.linear_k.weight
+decoder.encoders.3.self_attn.linear_k.bias
+decoder.encoders.3.self_attn.linear_v.weight
+decoder.encoders.3.self_attn.linear_v.bias
+decoder.encoders.3.self_attn.linear_out.weight
+decoder.encoders.3.self_attn.linear_out.bias
+decoder.encoders.3.feed_forward.w_1.weight
+decoder.encoders.3.feed_forward.w_1.bias
+decoder.encoders.3.feed_forward.w_2.weight
+decoder.encoders.3.feed_forward.w_2.bias
+decoder.encoders.3.norm1.weight
+decoder.encoders.3.norm1.bias
+decoder.encoders.3.norm2.weight
+decoder.encoders.3.norm2.bias
+decoder.after_norm.weight
+decoder.after_norm.bias
+feat_out.weight
+feat_out.bias
+postnet.postnet.0.0.weight
+postnet.postnet.0.1.weight
+postnet.postnet.0.1.bias
+postnet.postnet.0.1._mean
+postnet.postnet.0.1._variance
+postnet.postnet.1.0.weight
+postnet.postnet.1.1.weight
+postnet.postnet.1.1.bias
+postnet.postnet.1.1._mean
+postnet.postnet.1.1._variance
+postnet.postnet.2.0.weight
+postnet.postnet.2.1.weight
+postnet.postnet.2.1.bias
+postnet.postnet.2.1._mean
+postnet.postnet.2.1._variance
+postnet.postnet.3.0.weight
+postnet.postnet.3.1.weight
+postnet.postnet.3.1.bias
+postnet.postnet.3.1._mean
+postnet.postnet.3.1._variance
+postnet.postnet.4.0.weight
+postnet.postnet.4.1.weight
+postnet.postnet.4.1.bias
+postnet.postnet.4.1._mean
+postnet.postnet.4.1._variance
+
diff --git a/examples/other/tts_finetune/tts3/conf/finetune.yaml b/examples/other/tts_finetune/tts3/conf/finetune.yaml
new file mode 100644
index 00000000..7d0dd7b8
--- /dev/null
+++ b/examples/other/tts_finetune/tts3/conf/finetune.yaml
@@ -0,0 +1,14 @@
+###########################################################
+#                 PARAMS SETTING              #
+###########################################################
+# Set to -1 to indicate that the parameter is the same as the pretrained model configuration
+
+batch_size: -1
+learning_rate: 0.0001     # learning rate
+num_snapshots: -1
+
+# frozen_layers should be a list
+# if you don't need to freeze any layers, set frozen_layers to []
+# fastspeech2 layer names are listed in conf/fastspeech2_layers.txt
+# example: frozen_layers: ["encoder", "duration_predictor"]
+frozen_layers: ["encoder"]
diff --git a/examples/other/tts_finetune/tts3/finetune.py b/examples/other/tts_finetune/tts3/finetune.py
deleted file mode 100644
index 207e2dbc..00000000
--- a/examples/other/tts_finetune/tts3/finetune.py
+++ /dev/null
@@ -1,214 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import argparse
-import os
-from pathlib import Path
-from typing import List
-from typing import Union
-
-import yaml
-from local.check_oov import get_check_result
-from local.extract import extract_feature
-from local.label_process import get_single_label
-from local.prepare_env import generate_finetune_env
-from local.train import train_sp
-from paddle import distributed as dist
-from yacs.config import CfgNode
-
-from utils.gen_duration_from_textgrid import gen_duration_from_textgrid
-
-DICT_EN = 'tools/aligner/cmudict-0.7b'
-DICT_ZH = 'tools/aligner/simple.lexicon'
-MODEL_DIR_EN = 'tools/aligner/vctk_model.zip'
-MODEL_DIR_ZH = 'tools/aligner/aishell3_model.zip'
-MFA_PHONE_EN = 'tools/aligner/vctk_model/meta.yaml'
-MFA_PHONE_ZH = 'tools/aligner/aishell3_model/meta.yaml'
-MFA_PATH = 'tools/montreal-forced-aligner/bin'
-os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH']
-
-
-class TrainArgs():
-    def __init__(self,
-                 ngpu,
-                 config_file,
-                 dump_dir: Path,
-                 output_dir: Path,
-                 frozen_layers: List[str]):
-        # config: fastspeech2 config file.
-        self.config = str(config_file)
-        self.train_metadata = str(dump_dir / "train/norm/metadata.jsonl")
-        self.dev_metadata = str(dump_dir / "dev/norm/metadata.jsonl")
-        # model output dir.
-        self.output_dir = str(output_dir)
-        self.ngpu = ngpu
-        self.phones_dict = str(dump_dir / "phone_id_map.txt")
-        self.speaker_dict = str(dump_dir / "speaker_id_map.txt")
-        self.voice_cloning = False
-        # frozen layers
-        self.frozen_layers = frozen_layers
-
-
-def get_mfa_result(
-        input_dir: Union[str, Path],
-        mfa_dir: Union[str, Path],
-        lang: str='en', ):
-    """get mfa result
-
-    Args:
-        input_dir (Union[str, Path]): input dir including wav file and label
-        mfa_dir (Union[str, Path]): mfa result dir
-        lang (str, optional): input audio language. Defaults to 'en'.
-    """
-    # MFA
-    if lang == 'en':
-        DICT = DICT_EN
-        MODEL_DIR = MODEL_DIR_EN
-
-    elif lang == 'zh':
-        DICT = DICT_ZH
-        MODEL_DIR = MODEL_DIR_ZH
-    else:
-        print('please input right lang!!')
-
-    CMD = 'mfa_align' + ' ' + str(
-        input_dir) + ' ' + DICT + ' ' + MODEL_DIR + ' ' + str(mfa_dir)
-    os.system(CMD)
-
-
-if __name__ == '__main__':
-    # parse config and args
-    parser = argparse.ArgumentParser(
-        description="Preprocess audio and then extract features.")
-
-    parser.add_argument(
-        "--input_dir",
-        type=str,
-        default="./input/baker_mini",
-        help="directory containing audio and label file")
-
-    parser.add_argument(
-        "--pretrained_model_dir",
-        type=str,
-        default="./pretrained_models/fastspeech2_aishell3_ckpt_1.1.0",
-        help="Path to pretrained model")
-
-    parser.add_argument(
-        "--mfa_dir",
-        type=str,
-        default="./mfa_result",
-        help="directory to save aligned files")
-
-    parser.add_argument(
-        "--dump_dir",
-        type=str,
-        default="./dump",
-        help="directory to save feature files and metadata.")
-
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        default="./exp/default/",
-        help="directory to save finetune model.")
-
-    parser.add_argument(
-        '--lang',
-        type=str,
-        default='zh',
-        choices=['zh', 'en'],
-        help='Choose input audio language. zh or en')
-
-    parser.add_argument(
-        "--ngpu", type=int, default=2, help="if ngpu=0, use cpu.")
-
-    parser.add_argument("--epoch", type=int, default=100, help="finetune epoch")
-    parser.add_argument(
-        "--finetune_config",
-        type=str,
-        default="./finetune.yaml",
-        help="Path to finetune config file")
-
-    args = parser.parse_args()
-
-    fs = 24000
-    n_shift = 300
-    input_dir = Path(args.input_dir).expanduser()
-    mfa_dir = Path(args.mfa_dir).expanduser()
-    mfa_dir.mkdir(parents=True, exist_ok=True)
-    dump_dir = Path(args.dump_dir).expanduser()
-    dump_dir.mkdir(parents=True, exist_ok=True)
-    output_dir = Path(args.output_dir).expanduser()
-    output_dir.mkdir(parents=True, exist_ok=True)
-    pretrained_model_dir = Path(args.pretrained_model_dir).expanduser()
-
-    # read config
-    config_file = pretrained_model_dir / "default.yaml"
-    with open(config_file) as f:
-        config = CfgNode(yaml.safe_load(f))
-    config.max_epoch = config.max_epoch + args.epoch
-
-    with open(args.finetune_config) as f2:
-        finetune_config = CfgNode(yaml.safe_load(f2))
-    config.batch_size = finetune_config.batch_size if finetune_config.batch_size > 0 else config.batch_size
-    config.optimizer.learning_rate = finetune_config.learning_rate if finetune_config.learning_rate > 0 else config.optimizer.learning_rate
-    config.num_snapshots = finetune_config.num_snapshots if finetune_config.num_snapshots > 0 else config.num_snapshots
-    frozen_layers = finetune_config.frozen_layers
-    assert type(frozen_layers) == list, "frozen_layers should be set a list."
-
-    if args.lang == 'en':
-        lexicon_file = DICT_EN
-        mfa_phone_file = MFA_PHONE_EN
-    elif args.lang == 'zh':
-        lexicon_file = DICT_ZH
-        mfa_phone_file = MFA_PHONE_ZH
-    else:
-        print('please input right lang!!')
-
-    print(f"finetune max_epoch: {config.max_epoch}")
-    print(f"finetune batch_size: {config.batch_size}")
-    print(f"finetune learning_rate: {config.optimizer.learning_rate}")
-    print(f"finetune num_snapshots: {config.num_snapshots}")
-    print(f"finetune frozen_layers: {frozen_layers}")
-
-    am_phone_file = pretrained_model_dir / "phone_id_map.txt"
-    label_file = input_dir / "labels.txt"
-
-    #check phone for mfa and am finetune
-    oov_words, oov_files, oov_file_words = get_check_result(
-        label_file, lexicon_file, mfa_phone_file, am_phone_file)
-    input_dir = get_single_label(label_file, oov_files, input_dir)
-
-    # get mfa result
-    get_mfa_result(input_dir, mfa_dir, args.lang)
-
-    # # generate durations.txt
-    duration_file = "./durations.txt"
-    gen_duration_from_textgrid(mfa_dir, duration_file, fs, n_shift)
-
-    # generate phone and speaker map files
-    extract_feature(duration_file, config, input_dir, dump_dir,
-                    pretrained_model_dir)
-
-    # create finetune env
-    generate_finetune_env(output_dir, pretrained_model_dir)
-
-    # create a new args for training
-    train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir,
-                           frozen_layers)
-
-    # finetune models
-    # dispatch
-    if args.ngpu > 1:
-        dist.spawn(train_sp, (train_args, config), nprocs=args.ngpu)
-    else:
-        train_sp(train_args, config)
diff --git a/examples/other/tts_finetune/tts3/local/check_oov.py b/examples/other/tts_finetune/tts3/local/check_oov.py
index 4d685482..9e1d3f6e 100644
--- a/examples/other/tts_finetune/tts3/local/check_oov.py
+++ b/examples/other/tts_finetune/tts3/local/check_oov.py
@@ -11,17 +11,30 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import argparse
+import os
+import re
 from pathlib import Path
 from typing import Dict
 from typing import List
 from typing import Union
 
+DICT_EN = 'tools/aligner/cmudict-0.7b'
+DICT_ZH = 'tools/aligner/simple.lexicon'
+MODEL_DIR_EN = 'tools/aligner/vctk_model.zip'
+MODEL_DIR_ZH = 'tools/aligner/aishell3_model.zip'
+MFA_PHONE_EN = 'tools/aligner/vctk_model/meta.yaml'
+MFA_PHONE_ZH = 'tools/aligner/aishell3_model/meta.yaml'
+MFA_PATH = 'tools/montreal-forced-aligner/bin'
+os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH']
+
 
 def check_phone(label_file: Union[str, Path],
-                pinyin_phones: Dict[str, str],
+                pronunciation_phones: Dict[str, str],
                 mfa_phones: List[str],
                 am_phones: List[str],
-                oov_record: str="./oov_info.txt"):
+                oov_record: str="./oov_info.txt",
+                lang: str="zh"):
     """Check whether the phoneme corresponding to the audio text content 
     is in the phoneme list of the pretrained mfa model to ensure that the alignment is normal.
     Check whether the phoneme corresponding to the audio text content 
@@ -29,7 +42,7 @@ def check_phone(label_file: Union[str, Path],
 
     Args:
         label_file (Union[str, Path]): label file, format: utt_id|phone seq
-        pinyin_phones (dict): pinyin to phones map dict
+        pronunciation_phones (dict): pronunciation to phones map dict
         mfa_phones (list): the phone list of pretrained mfa model
         am_phones (list): the phone list of pretrained mfa model
 
@@ -46,16 +59,21 @@ def check_phone(label_file: Union[str, Path],
         for line in f.readlines():
             utt_id = line.split("|")[0]
             transcription = line.strip().split("|")[1]
+            transcription = re.sub(
+                r'[：、，；。？！,.:;"?!”’《》【】<=>{}()（）#&@“”^_|…\\]', '',
+                transcription)
+            if lang == "en":
+                transcription = transcription.upper()
             flag = 0
             temp_oov_words = []
             for word in transcription.split(" "):
-                if word not in pinyin_phones.keys():
+                if word not in pronunciation_phones.keys():
                     temp_oov_words.append(word)
                     flag = 1
                     if word not in oov_words:
                         oov_words.append(word)
                 else:
-                    for p in pinyin_phones[word]:
+                    for p in pronunciation_phones[word]:
                         if p not in mfa_phones or p not in am_phones:
                             temp_oov_words.append(word)
                             flag = 1
@@ -74,20 +92,20 @@ def check_phone(label_file: Union[str, Path],
     return oov_words, oov_files, oov_file_words
 
 
-def get_pinyin_phones(lexicon_file: Union[str, Path]):
-    # pinyin to phones
-    pinyin_phones = {}
+def get_pronunciation_phones(lexicon_file: Union[str, Path]):
+    # pronunciation to phones
+    pronunciation_phones = {}
     with open(lexicon_file, "r") as f2:
         for line in f2.readlines():
             line_list = line.strip().split(" ")
-            pinyin = line_list[0]
+            pronunciation = line_list[0]
             if line_list[1] == '':
                 phones = line_list[2:]
             else:
                 phones = line_list[1:]
-            pinyin_phones[pinyin] = phones
+            pronunciation_phones[pronunciation] = phones
 
-    return pinyin_phones
+    return pronunciation_phones
 
 
 def get_mfa_phone(mfa_phone_file: Union[str, Path]):
@@ -114,12 +132,109 @@ def get_am_phone(am_phone_file: Union[str, Path]):
 
 
 def get_check_result(label_file: Union[str, Path],
-                     lexicon_file: Union[str, Path],
-                     mfa_phone_file: Union[str, Path],
-                     am_phone_file: Union[str, Path]):
-    pinyin_phones = get_pinyin_phones(lexicon_file)
+                     am_phone_file: Union[str, Path],
+                     input_dir: Union[str, Path],
+                     newdir_name: str="newdir",
+                     lang: str="zh"):
+    """Check if there is any audio in the input that contains the oov word according to label_file.
+       Copy audio that does not contain oov word to input_dir / newdir_name.
+       Generate label file and save to input_dir / newdir_name.
+
+
+    Args:
+        label_file (Union[str, Path]): input audio label file, format: utt|pronunciation 
+        am_phone_file (Union[str, Path]): pretrained am model phone file
+        input_dir (Union[str, Path]): input dir
+        newdir_name (str): directory name saved after checking oov
+        lang (str): input audio language
+    """
+
+    if lang == 'en':
+        lexicon_file = DICT_EN
+        mfa_phone_file = MFA_PHONE_EN
+    elif lang == 'zh':
+        lexicon_file = DICT_ZH
+        mfa_phone_file = MFA_PHONE_ZH
+    else:
+        print('please input right lang!!')
+
+    pronunciation_phones = get_pronunciation_phones(lexicon_file)
     mfa_phones = get_mfa_phone(mfa_phone_file)
     am_phones = get_am_phone(am_phone_file)
     oov_words, oov_files, oov_file_words = check_phone(
-        label_file, pinyin_phones, mfa_phones, am_phones)
-    return oov_words, oov_files, oov_file_words
+        label_file=label_file,
+        pronunciation_phones=pronunciation_phones,
+        mfa_phones=mfa_phones,
+        am_phones=am_phones,
+        oov_record="./oov_info.txt",
+        lang=lang)
+
+    input_dir = Path(input_dir).expanduser()
+    new_dir = input_dir / newdir_name
+    new_dir.mkdir(parents=True, exist_ok=True)
+    with open(label_file, "r") as f:
+        for line in f.readlines():
+            utt_id = line.split("|")[0]
+            if utt_id not in oov_files:
+                transcription = line.split("|")[1].strip()
+                wav_file = str(input_dir) + "/" + utt_id + ".wav"
+                new_wav_file = str(new_dir) + "/" + utt_id + ".wav"
+                os.system("cp %s %s" % (wav_file, new_wav_file))
+                single_file = str(new_dir) + "/" + utt_id + ".txt"
+                with open(single_file, "w") as fw:
+                    fw.write(transcription)
+
+
+if __name__ == '__main__':
+    # parse config and args
+    parser = argparse.ArgumentParser(
+        description="Preprocess audio and then extract features.")
+
+    parser.add_argument(
+        "--input_dir",
+        type=str,
+        default="./input/csmsc_mini",
+        help="directory containing audio and label file")
+
+    parser.add_argument(
+        "--pretrained_model_dir",
+        type=str,
+        default="./pretrained_models/fastspeech2_aishell3_ckpt_1.1.0",
+        help="Path to pretrained model")
+
+    parser.add_argument(
+        "--newdir_name",
+        type=str,
+        default="newdir",
+        help="directory name saved after checking oov")
+
+    parser.add_argument(
+        '--lang',
+        type=str,
+        default='zh',
+        choices=['zh', 'en'],
+        help='Choose input audio language. zh or en')
+
+    args = parser.parse_args()
+
+    # if args.lang == 'en':
+    #     lexicon_file = DICT_EN
+    #     mfa_phone_file = MFA_PHONE_EN
+    # elif args.lang == 'zh':
+    #     lexicon_file = DICT_ZH
+    #     mfa_phone_file = MFA_PHONE_ZH
+    # else:
+    #     print('please input right lang!!')
+    assert args.lang == "zh" or args.lang == "en", "please input right lang! zh or en"
+
+    input_dir = Path(args.input_dir).expanduser()
+    pretrained_model_dir = Path(args.pretrained_model_dir).expanduser()
+    am_phone_file = pretrained_model_dir / "phone_id_map.txt"
+    label_file = input_dir / "labels.txt"
+
+    get_check_result(
+        label_file=label_file,
+        am_phone_file=am_phone_file,
+        input_dir=input_dir,
+        newdir_name=args.newdir_name,
+        lang=args.lang)
diff --git a/examples/other/tts_finetune/tts3/local/extract.py b/examples/other/tts_finetune/tts3/local/extract_feature.py
similarity index 87%
rename from examples/other/tts_finetune/tts3/local/extract.py
rename to examples/other/tts_finetune/tts3/local/extract_feature.py
index 630b58ce..3277db53 100644
--- a/examples/other/tts_finetune/tts3/local/extract.py
+++ b/examples/other/tts_finetune/tts3/local/extract_feature.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import argparse
 import logging
 import os
 from operator import itemgetter
@@ -20,8 +21,10 @@ from typing import Union
 
 import jsonlines
 import numpy as np
+import yaml
 from sklearn.preprocessing import StandardScaler
 from tqdm import tqdm
+from yacs.config import CfgNode
 
 from paddlespeech.t2s.datasets.data_table import DataTable
 from paddlespeech.t2s.datasets.get_feats import Energy
@@ -284,3 +287,49 @@ def extract_feature(duration_file: str,
         # norm
         normalize(speech_scaler, pitch_scaler, energy_scaler, vocab_phones,
                   vocab_speaker, dump_dir, "test")
+
+
+if __name__ == '__main__':
+    # parse config and args
+    parser = argparse.ArgumentParser(
+        description="Preprocess audio and then extract features.")
+
+    parser.add_argument(
+        "--duration_file",
+        type=str,
+        default="./durations.txt",
+        help="duration file")
+
+    parser.add_argument(
+        "--input_dir",
+        type=str,
+        default="./input/baker_mini/newdir",
+        help="directory containing audio and label file")
+
+    parser.add_argument(
+        "--dump_dir", type=str, default="./dump", help="dump dir")
+
+    parser.add_argument(
+        "--pretrained_model_dir",
+        type=str,
+        default="./pretrained_models/fastspeech2_aishell3_ckpt_1.1.0",
+        help="Path to pretrained model")
+
+    args = parser.parse_args()
+
+    input_dir = Path(args.input_dir).expanduser()
+    dump_dir = Path(args.dump_dir).expanduser()
+    dump_dir.mkdir(parents=True, exist_ok=True)
+    pretrained_model_dir = Path(args.pretrained_model_dir).expanduser()
+
+    # read config
+    config_file = pretrained_model_dir / "default.yaml"
+    with open(config_file) as f:
+        config = CfgNode(yaml.safe_load(f))
+
+    extract_feature(
+        duration_file=args.duration_file,
+        config=config,
+        input_dir=input_dir,
+        dump_dir=dump_dir,
+        pretrained_model_dir=pretrained_model_dir)
diff --git a/examples/other/tts_finetune/tts3/local/train.py b/examples/other/tts_finetune/tts3/local/finetune.py
similarity index 65%
rename from examples/other/tts_finetune/tts3/local/train.py
rename to examples/other/tts_finetune/tts3/local/finetune.py
index d065ae59..496c2355 100644
--- a/examples/other/tts_finetune/tts3/local/train.py
+++ b/examples/other/tts_finetune/tts3/local/finetune.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import argparse
 import logging
 import os
 import shutil
@@ -20,10 +21,12 @@ from typing import List
 import jsonlines
 import numpy as np
 import paddle
+import yaml
 from paddle import DataParallel
 from paddle import distributed as dist
 from paddle.io import DataLoader
 from paddle.io import DistributedBatchSampler
+from yacs.config import CfgNode
 
 from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_multi_spk_batch_fn
 from paddlespeech.t2s.datasets.am_batch_fn import fastspeech2_single_spk_batch_fn
@@ -38,6 +41,27 @@ from paddlespeech.t2s.training.seeding import seed_everything
 from paddlespeech.t2s.training.trainer import Trainer
 
 
+class TrainArgs():
+    def __init__(self,
+                 ngpu,
+                 config_file,
+                 dump_dir: Path,
+                 output_dir: Path,
+                 frozen_layers: List[str]):
+        # config: fastspeech2 config file.
+        self.config = str(config_file)
+        self.train_metadata = str(dump_dir / "train/norm/metadata.jsonl")
+        self.dev_metadata = str(dump_dir / "dev/norm/metadata.jsonl")
+        # model output dir.
+        self.output_dir = str(output_dir)
+        self.ngpu = ngpu
+        self.phones_dict = str(dump_dir / "phone_id_map.txt")
+        self.speaker_dict = str(dump_dir / "speaker_id_map.txt")
+        self.voice_cloning = False
+        # frozen layers
+        self.frozen_layers = frozen_layers
+
+
 def freeze_layer(model, layers: List[str]):
     """freeze layers
 
@@ -176,3 +200,70 @@ def train_sp(args, config):
     trainer.extend(
         Snapshot(max_size=config.num_snapshots), trigger=(1, 'epoch'))
     trainer.run()
+
+
+if __name__ == '__main__':
+    # parse config and args
+    parser = argparse.ArgumentParser(
+        description="Preprocess audio and then extract features.")
+
+    parser.add_argument(
+        "--pretrained_model_dir",
+        type=str,
+        default="./pretrained_models/fastspeech2_aishell3_ckpt_1.1.0",
+        help="Path to pretrained model")
+
+    parser.add_argument(
+        "--dump_dir",
+        type=str,
+        default="./dump",
+        help="directory to save feature files and metadata.")
+
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="./exp/default/",
+        help="directory to save finetune model.")
+
+    parser.add_argument(
+        "--ngpu", type=int, default=2, help="if ngpu=0, use cpu.")
+
+    parser.add_argument("--epoch", type=int, default=100, help="finetune epoch")
+    parser.add_argument(
+        "--finetune_config",
+        type=str,
+        default="./finetune.yaml",
+        help="Path to finetune config file")
+
+    args = parser.parse_args()
+
+    dump_dir = Path(args.dump_dir).expanduser()
+    dump_dir.mkdir(parents=True, exist_ok=True)
+    output_dir = Path(args.output_dir).expanduser()
+    output_dir.mkdir(parents=True, exist_ok=True)
+    pretrained_model_dir = Path(args.pretrained_model_dir).expanduser()
+
+    # read config
+    config_file = pretrained_model_dir / "default.yaml"
+    with open(config_file) as f:
+        config = CfgNode(yaml.safe_load(f))
+    config.max_epoch = config.max_epoch + args.epoch
+
+    with open(args.finetune_config) as f2:
+        finetune_config = CfgNode(yaml.safe_load(f2))
+    config.batch_size = finetune_config.batch_size if finetune_config.batch_size > 0 else config.batch_size
+    config.optimizer.learning_rate = finetune_config.learning_rate if finetune_config.learning_rate > 0 else config.optimizer.learning_rate
+    config.num_snapshots = finetune_config.num_snapshots if finetune_config.num_snapshots > 0 else config.num_snapshots
+    frozen_layers = finetune_config.frozen_layers
+    assert type(frozen_layers) == list, "frozen_layers should be set a list."
+
+    # create a new args for training
+    train_args = TrainArgs(args.ngpu, config_file, dump_dir, output_dir,
+                           frozen_layers)
+
+    # finetune models
+    # dispatch
+    if args.ngpu > 1:
+        dist.spawn(train_sp, (train_args, config), nprocs=args.ngpu)
+    else:
+        train_sp(train_args, config)
diff --git a/examples/other/tts_finetune/tts3/local/generate_duration.py b/examples/other/tts_finetune/tts3/local/generate_duration.py
new file mode 100644
index 00000000..e512d478
--- /dev/null
+++ b/examples/other/tts_finetune/tts3/local/generate_duration.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+from pathlib import Path
+
+from utils.gen_duration_from_textgrid import gen_duration_from_textgrid
+
+if __name__ == '__main__':
+    # parse config and args
+    parser = argparse.ArgumentParser(
+        description="Generate durations.txt from MFA alignment results.")
+
+    parser.add_argument(
+        "--mfa_dir",
+        type=str,
+        default="./mfa_result",
+        help="directory containing MFA aligned files")
+
+    args = parser.parse_args()
+
+    fs = 24000  # presumably the audio sample rate expected by the helper - confirm
+    n_shift = 300  # presumably the frame hop size in samples - confirm
+    duration_file = "./durations.txt"
+    mfa_dir = Path(args.mfa_dir).expanduser()
+    mfa_dir.mkdir(parents=True, exist_ok=True)
+
+    gen_duration_from_textgrid(mfa_dir, duration_file, fs, n_shift)
diff --git a/examples/other/tts_finetune/tts3/local/get_mfa_result.py b/examples/other/tts_finetune/tts3/local/get_mfa_result.py
new file mode 100644
index 00000000..f564fbfc
--- /dev/null
+++ b/examples/other/tts_finetune/tts3/local/get_mfa_result.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+from pathlib import Path
+from typing import Union
+
+DICT_EN = 'tools/aligner/cmudict-0.7b'
+DICT_ZH = 'tools/aligner/simple.lexicon'
+MODEL_DIR_EN = 'tools/aligner/vctk_model.zip'
+MODEL_DIR_ZH = 'tools/aligner/aishell3_model.zip'
+MFA_PHONE_EN = 'tools/aligner/vctk_model/meta.yaml'
+MFA_PHONE_ZH = 'tools/aligner/aishell3_model/meta.yaml'
+MFA_PATH = 'tools/montreal-forced-aligner/bin'
+os.environ['PATH'] = MFA_PATH + '/:' + os.environ['PATH']
+
+
+def get_mfa_result(
+        input_dir: Union[str, Path],
+        mfa_dir: Union[str, Path],
+        lang: str='en', ):
+    """Run the ``mfa_align`` command on ``input_dir``, writing to ``mfa_dir``.
+
+    Args:
+        input_dir (Union[str, Path]): input dir including wav file and label
+        mfa_dir (Union[str, Path]): mfa result dir
+        lang (str, optional): input audio language, 'en' or 'zh'. Defaults to 'en'.
+
+    Raises:
+        ValueError: if ``lang`` is neither 'en' nor 'zh'.
+    """
+    # select the dictionary and model paths for the requested language
+    if lang == 'en':
+        DICT, MODEL_DIR = DICT_EN, MODEL_DIR_EN
+    elif lang == 'zh':
+        DICT, MODEL_DIR = DICT_ZH, MODEL_DIR_ZH
+    else:
+        raise ValueError("unsupported lang %r, expected 'en' or 'zh'" % lang)
+
+    CMD = 'mfa_align' + ' ' + str(
+        input_dir) + ' ' + DICT + ' ' + MODEL_DIR + ' ' + str(mfa_dir)
+    os.system(CMD)
+
+
+if __name__ == '__main__':
+    # parse config and args
+    parser = argparse.ArgumentParser(
+        description="Run MFA forced alignment on audio and label files.")
+
+    parser.add_argument(
+        "--input_dir",
+        type=str,
+        default="./input/baker_mini/newdir",
+        help="directory containing audio and label file")
+
+    parser.add_argument(
+        "--mfa_dir",
+        type=str,
+        default="./mfa_result",
+        help="directory to save aligned files")
+
+    parser.add_argument(
+        '--lang',
+        type=str,
+        default='zh',
+        choices=['zh', 'en'],
+        help='Choose input audio language. zh or en')
+
+    args = parser.parse_args()
+
+    get_mfa_result(
+        input_dir=args.input_dir, mfa_dir=args.mfa_dir, lang=args.lang)
diff --git a/examples/other/tts_finetune/tts3/local/label_process.py b/examples/other/tts_finetune/tts3/local/label_process.py
deleted file mode 100644
index 711dde4b..00000000
--- a/examples/other/tts_finetune/tts3/local/label_process.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-from pathlib import Path
-from typing import List
-from typing import Union
-
-
-def change_baker_label(baker_label_file: Union[str, Path],
-                       out_label_file: Union[str, Path]):
-    """change baker label file to regular label file
-
-    Args:
-        baker_label_file (Union[str, Path]): Original baker label file
-        out_label_file (Union[str, Path]): regular label file
-    """
-    with open(baker_label_file) as f:
-        lines = f.readlines()
-
-    with open(out_label_file, "w") as fw:
-        for i in range(0, len(lines), 2):
-            utt_id = lines[i].split()[0]
-            transcription = lines[i + 1].strip()
-            fw.write(utt_id + "|" + transcription + "\n")
-
-
-def get_single_label(label_file: Union[str, Path],
-                     oov_files: List[Union[str, Path]],
-                     input_dir: Union[str, Path]):
-    """Divide the label file into individual files according to label_file
-
-    Args:
-        label_file (str or Path): label file, format: utt_id|phones id
-        input_dir (Path): input dir including audios
-    """
-    input_dir = Path(input_dir).expanduser()
-    new_dir = input_dir / "newdir"
-    new_dir.mkdir(parents=True, exist_ok=True)
-
-    with open(label_file, "r") as f:
-        for line in f.readlines():
-            utt_id = line.split("|")[0]
-            if utt_id not in oov_files:
-                transcription = line.split("|")[1].strip()
-                wav_file = str(input_dir) + "/" + utt_id + ".wav"
-                new_wav_file = str(new_dir) + "/" + utt_id + ".wav"
-                os.system("cp %s %s" % (wav_file, new_wav_file))
-                single_file = str(new_dir) + "/" + utt_id + ".txt"
-                with open(single_file, "w") as fw:
-                    fw.write(transcription)
-
-    return new_dir
diff --git a/examples/other/tts_finetune/tts3/local/prepare_env.py b/examples/other/tts_finetune/tts3/local/prepare_env.py
index f2166ff1..5e4f9634 100644
--- a/examples/other/tts_finetune/tts3/local/prepare_env.py
+++ b/examples/other/tts_finetune/tts3/local/prepare_env.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import argparse
 import os
 from pathlib import Path
 
@@ -33,3 +34,29 @@ def generate_finetune_env(output_dir: Path, pretrained_model_dir: Path):
         line = "\"time\": \"2022-08-06 07:51:53.463650\", \"path\": \"%s\", \"iteration\": %d" % (
             str(output_dir / model_file), iter)
         f.write("{" + line + "}" + "\n")
+
+
+if __name__ == '__main__':
+    # parse config and args
+    parser = argparse.ArgumentParser(
+        description="Create the finetune environment from a pretrained model.")
+
+    parser.add_argument(
+        "--pretrained_model_dir",
+        type=str,
+        default="./pretrained_models/fastspeech2_aishell3_ckpt_1.1.0",
+        help="Path to pretrained model")
+
+    parser.add_argument(
+        "--output_dir",
+        type=str,
+        default="./exp/default/",
+        help="directory to save finetune model.")
+
+    args = parser.parse_args()
+
+    output_dir = Path(args.output_dir).expanduser()
+    output_dir.mkdir(parents=True, exist_ok=True)
+    pretrained_model_dir = Path(args.pretrained_model_dir).expanduser()
+
+    generate_finetune_env(output_dir, pretrained_model_dir)
diff --git a/examples/other/tts_finetune/tts3/run.sh b/examples/other/tts_finetune/tts3/run.sh
index 9c877e64..1faa2b46 100755
--- a/examples/other/tts_finetune/tts3/run.sh
+++ b/examples/other/tts_finetune/tts3/run.sh
@@ -5,13 +5,16 @@ source path.sh
 
 
 input_dir=./input/csmsc_mini
+newdir_name="newdir"
+new_dir=${input_dir}/${newdir_name}
 pretrained_model_dir=./pretrained_models/fastspeech2_aishell3_ckpt_1.1.0
+mfa_tools=./tools
 mfa_dir=./mfa_result
 dump_dir=./dump
 output_dir=./exp/default
 lang=zh
 ngpu=1
-finetune_config=./finetune.yaml
+finetune_config=./conf/finetune.yaml
 
 ckpt=snapshot_iter_96699
 
@@ -26,25 +29,65 @@ stop_stage=100
 # this can not be mixed use with `$1`, `$2` ...
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
 
+# check oov
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # finetune
-    python3 finetune.py \
+    echo "check oov"
+    python3 local/check_oov.py \
         --input_dir=${input_dir} \
         --pretrained_model_dir=${pretrained_model_dir} \
+        --newdir_name=${newdir_name} \
+        --lang=${lang}
+fi
+
+# get mfa result
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "get mfa result"
+    python3 local/get_mfa_result.py \
+        --input_dir=${new_dir} \
         --mfa_dir=${mfa_dir} \
+        --lang=${lang}
+fi
+
+# generate durations.txt
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "generate durations.txt"
+    python3 local/generate_duration.py \
+        --mfa_dir=${mfa_dir} 
+fi
+
+# extract feature
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "extract feature"
+    python3 local/extract_feature.py \
+        --duration_file="./durations.txt" \
+        --input_dir=${new_dir} \
+        --dump_dir=${dump_dir} \
+        --pretrained_model_dir=${pretrained_model_dir}
+fi
+
+# create finetune env
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "create finetune env"
+    python3 local/prepare_env.py \
+        --pretrained_model_dir=${pretrained_model_dir} \
+        --output_dir=${output_dir}
+fi
+
+# finetune
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    echo "finetune..."
+    python3 local/finetune.py \
+        --pretrained_model_dir=${pretrained_model_dir} \
         --dump_dir=${dump_dir} \
         --output_dir=${output_dir} \
-        --lang=${lang} \
         --ngpu=${ngpu} \
         --epoch=100 \
         --finetune_config=${finetune_config}
 fi
 
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+# synthesize e2e
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
     echo "in hifigan syn_e2e"
-    FLAGS_allocator_strategy=naive_best_fit \
-    FLAGS_fraction_of_gpu_memory_to_use=0.01 \
     python3 ${BIN_DIR}/../synthesize_e2e.py \
         --am=fastspeech2_aishell3 \
         --am_config=${pretrained_model_dir}/default.yaml \
diff --git a/examples/other/tts_finetune/tts3/run_en.sh b/examples/other/tts_finetune/tts3/run_en.sh
new file mode 100755
index 00000000..e8551667
--- /dev/null
+++ b/examples/other/tts_finetune/tts3/run_en.sh
@@ -0,0 +1,105 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+input_dir=./input/ljspeech_mini
+newdir_name="newdir"
+new_dir=${input_dir}/${newdir_name}
+pretrained_model_dir=./pretrained_models/fastspeech2_vctk_ckpt_1.2.0
+mfa_tools=./tools  # not referenced anywhere else in this script
+mfa_dir=./mfa_result
+dump_dir=./dump
+output_dir=./exp/default
+lang=en
+ngpu=1
+finetune_config=./conf/finetune.yaml
+
+ckpt=snapshot_iter_66300
+
+gpus=1
+CUDA_VISIBLE_DEVICES=${gpus}  # NOTE(review): no `export`; only reaches python3 if already exported (e.g. by path.sh) - confirm
+stage=0
+stop_stage=100
+
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run_en.sh --stage 0 --stop-stage 0`
+# this can not be mixed use with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+# check oov
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "check oov"
+    python3 local/check_oov.py \
+        --input_dir=${input_dir} \
+        --pretrained_model_dir=${pretrained_model_dir} \
+        --newdir_name=${newdir_name} \
+        --lang=${lang}
+fi
+
+# get mfa result
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "get mfa result"
+    python3 local/get_mfa_result.py \
+        --input_dir=${new_dir} \
+        --mfa_dir=${mfa_dir} \
+        --lang=${lang}
+fi
+
+# generate durations.txt
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "generate durations.txt"
+    python3 local/generate_duration.py \
+        --mfa_dir=${mfa_dir} 
+fi
+
+# extract feature
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "extract feature"
+    python3 local/extract_feature.py \
+        --duration_file="./durations.txt" \
+        --input_dir=${new_dir} \
+        --dump_dir=${dump_dir} \
+        --pretrained_model_dir=${pretrained_model_dir}
+fi
+
+# create finetune env
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "create finetune env"
+    python3 local/prepare_env.py \
+        --pretrained_model_dir=${pretrained_model_dir} \
+        --output_dir=${output_dir}
+fi
+
+# finetune
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    echo "finetune..."
+    python3 local/finetune.py \
+        --pretrained_model_dir=${pretrained_model_dir} \
+        --dump_dir=${dump_dir} \
+        --output_dir=${output_dir} \
+        --ngpu=${ngpu} \
+        --epoch=100 \
+        --finetune_config=${finetune_config}
+fi
+
+# synthesize e2e
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    echo "in hifigan syn_e2e"
+    python3 ${BIN_DIR}/../synthesize_e2e.py \
+        --am=fastspeech2_vctk \
+        --am_config=${pretrained_model_dir}/default.yaml \
+        --am_ckpt=${output_dir}/checkpoints/${ckpt}.pdz \
+        --am_stat=${pretrained_model_dir}/speech_stats.npy \
+        --voc=hifigan_vctk \
+        --voc_config=pretrained_models/hifigan_vctk_ckpt_0.2.0/default.yaml \
+        --voc_ckpt=pretrained_models/hifigan_vctk_ckpt_0.2.0/snapshot_iter_2500000.pdz \
+        --voc_stat=pretrained_models/hifigan_vctk_ckpt_0.2.0/feats_stats.npy \
+        --lang=en \
+        --text=${BIN_DIR}/../sentences_en.txt \
+        --output_dir=./test_e2e/ \
+        --phones_dict=${dump_dir}/phone_id_map.txt \
+        --speaker_dict=${dump_dir}/speaker_id_map.txt \
+        --spk_id=0 
+fi
diff --git a/paddlespeech/audio/utils/tensor_utils.py b/paddlespeech/audio/utils/tensor_utils.py
index 16f60810..e9008f17 100644
--- a/paddlespeech/audio/utils/tensor_utils.py
+++ b/paddlespeech/audio/utils/tensor_utils.py
@@ -31,7 +31,6 @@ def has_tensor(val):
                 return True
     elif isinstance(val, dict):
         for k, v in val.items():
-            print(k)
             if has_tensor(v):
                 return True
     else:
@@ -143,14 +142,15 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int,
                 [ 7,  8,  9, 11, -1, -1]])
     """
     # TODO(Hui Zhang): using comment code,
-    #_sos = paddle.to_tensor(
-    #    [sos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
-    #_eos = paddle.to_tensor(
-    #    [eos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
-    #ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
-    #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys]
-    #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys]
-    #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id)
+    # _sos = paddle.to_tensor(
+    #    [sos], dtype=ys_pad.dtype, stop_gradient=True, place=ys_pad.place)
+    # _eos = paddle.to_tensor(
+    #    [eos], dtype=ys_pad.dtype, stop_gradient=True, place=ys_pad.place)
+    # ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
+    # ys_in = [paddle.concat([_sos, y], axis=0) for y in ys]
+    # ys_out = [paddle.concat([y, _eos], axis=0) for y in ys]
+    # return pad_sequence(ys_in, padding_value=eos).transpose([1,0]), pad_sequence(ys_out, padding_value=ignore_id).transpose([1,0])
+
     B = ys_pad.shape[0]
     _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos
     _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos
@@ -190,3 +190,106 @@ def th_accuracy(pad_outputs: paddle.Tensor,
     # denominator = paddle.sum(mask)
     denominator = paddle.sum(mask.type_as(pad_targets))
     return float(numerator) / float(denominator)
+
+
+def reverse_pad_list(ys_pad: paddle.Tensor,
+                     ys_lens: paddle.Tensor,
+                     pad_value: float=-1.0) -> paddle.Tensor:
+    """Reverse each sequence over its own length, then re-pad the batch.
+    Args:
+        ys_pad (tensor): The padded tensor (B, Tokenmax).
+        ys_lens (tensor): The lens of token seqs (B)
+        pad_value (float): Value for padding.
+    Returns:
+        Tensor: Padded tensor (B, Tokenmax).
+    Examples:
+        >>> x
+        tensor([[1, 2, 3, 4], [5, 6, 7, 0], [8, 9, 0, 0]])
+        >>> reverse_pad_list(x, lens, 0)  # lens = [4, 3, 2]
+        tensor([[4, 3, 2, 1],
+                [7, 6, 5, 0],
+                [9, 8, 0, 0]])
+    """
+    r_ys_pad = pad_sequence([(paddle.flip(y.int()[:i], [0]))
+                             for y, i in zip(ys_pad, ys_lens)], True, pad_value)
+    return r_ys_pad
+
+
+def st_reverse_pad_list(ys_pad: paddle.Tensor,
+                        ys_lens: paddle.Tensor,
+                        sos: float,
+                        eos: float) -> paddle.Tensor:
+    """Reverse each row over its length, prepend sos and fill the rest with eos.
+    Args:
+        ys_pad (tensor): The padded tensor (B, Tokenmax).
+        ys_lens (tensor): The lens of token seqs (B)
+        sos: start-of-sentence id prepended to every row.
+        eos: end-of-sentence id written to positions beyond each length.
+    Returns:
+        Tensor: Reversed tensor (B, max(ys_lens) + 1).
+    Examples:
+        >>> st_reverse_pad_list(x, lens, sos, eos)  # x rows: [1,2,3] [9,8,4] [2,..]
+        tensor([[sos, 3, 2, 1],
+                [sos, 4, 8, 9],
+                [sos, 2, eos, eos]])
+    """
+    # Equal to:
+    #   >>> r_hyps = reverse_pad_list(r_hyps, r_hyps_lens, float(self.ignore_id))
+    #   >>> r_hyps, _ = add_sos_eos(r_hyps, self.sos, self.eos, self.ignore_id)
+    B = ys_pad.shape[0]
+    _sos = paddle.full([B, 1], sos, dtype=ys_pad.dtype)
+    max_len = paddle.max(ys_lens)
+    index_range = paddle.arange(0, max_len, 1)
+    seq_len_expand = ys_lens.unsqueeze(1)
+    seq_mask = seq_len_expand > index_range  # (beam, max_len)
+
+    index = (seq_len_expand - 1) - index_range  # (beam, max_len)
+    #   >>> index
+    #   >>> tensor([[ 2,  1,  0],
+    #   >>>         [ 2,  1,  0],
+    #   >>>         [ 0, -1, -2]])
+    index = index * seq_mask
+
+    #   >>> index
+    #   >>> tensor([[2, 1, 0],
+    #   >>>         [2, 1, 0],
+    #   >>>         [0, 0, 0]])
+    def paddle_gather(x, dim, index):
+        index_shape = index.shape
+        index_flatten = index.flatten()
+        if dim < 0:
+            dim = len(x.shape) + dim
+        nd_index = []
+        for k in range(len(x.shape)):
+            if k == dim:
+                nd_index.append(index_flatten)
+            else:
+                reshape_shape = [1] * len(x.shape)
+                reshape_shape[k] = x.shape[k]
+                x_arange = paddle.arange(x.shape[k], dtype=index.dtype)
+                x_arange = x_arange.reshape(reshape_shape)
+                dim_index = paddle.expand(x_arange, index_shape).flatten()
+                nd_index.append(dim_index)
+        ind2 = paddle.transpose(paddle.stack(nd_index), [1, 0]).astype("int64")
+        paddle_out = paddle.gather_nd(x, ind2).reshape(index_shape)
+        return paddle_out
+
+    r_hyps = paddle_gather(ys_pad, 1, index)
+    #   >>> r_hyps
+    #   >>> tensor([[3, 2, 1],
+    #   >>>         [4, 8, 9],
+    #   >>>         [2, 2, 2]])
+    eos = paddle.full([1], eos, dtype=r_hyps.dtype)
+    r_hyps = paddle.where(seq_mask, r_hyps, eos)
+    #   >>> r_hyps
+    #   >>> tensor([[3, 2, 1],
+    #   >>>         [4, 8, 9],
+    #   >>>         [2, eos, eos]])
+
+    r_hyps = paddle.concat([_sos, r_hyps], axis=1)
+    # r_hyps = paddle.concat([hyps[:, 0:1], r_hyps], axis=1)
+    #   >>> r_hyps
+    #   >>> tensor([[sos, 3, 2, 1],
+    #   >>>         [sos, 4, 8, 9],
+    #   >>>         [sos, 2, eos, eos]])
+    return r_hyps
diff --git a/paddlespeech/s2t/exps/u2/bin/test_wav.py b/paddlespeech/s2t/exps/u2/bin/test_wav.py
index 887ec7a6..4588def0 100644
--- a/paddlespeech/s2t/exps/u2/bin/test_wav.py
+++ b/paddlespeech/s2t/exps/u2/bin/test_wav.py
@@ -40,7 +40,7 @@ class U2Infer():
         self.preprocess_conf = config.preprocess_config
         self.preprocess_args = {"train": False}
         self.preprocessing = Transformation(self.preprocess_conf)
-
+        self.reverse_weight = getattr(config.model_conf, 'reverse_weight', 0.0)
         self.text_feature = TextFeaturizer(
             unit_type=config.unit_type,
             vocab=config.vocab_filepath,
@@ -89,7 +89,8 @@ class U2Infer():
                 ctc_weight=decode_config.ctc_weight,
                 decoding_chunk_size=decode_config.decoding_chunk_size,
                 num_decoding_left_chunks=decode_config.num_decoding_left_chunks,
-                simulate_streaming=decode_config.simulate_streaming)
+                simulate_streaming=decode_config.simulate_streaming,
+                reverse_weight=self.reverse_weight)
             rsl = result_transcripts[0][0]
             utt = Path(self.audio_file).name
             logger.info(f"hyp: {utt} {result_transcripts[0][0]}")
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index db60083b..a13a6385 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -253,7 +253,6 @@ class U2Trainer(Trainer):
                 model_conf.output_dim = self.test_loader.vocab_size
 
         model = U2Model.from_config(model_conf)
-
         if self.parallel:
             model = paddle.DataParallel(model)
 
@@ -317,6 +316,7 @@ class U2Tester(U2Trainer):
             vocab=self.config.vocab_filepath,
             spm_model_prefix=self.config.spm_model_prefix)
         self.vocab_list = self.text_feature.vocab_list
+        self.reverse_weight = getattr(config.model_conf, 'reverse_weight', 0.0)
 
     def id2token(self, texts, texts_len, text_feature):
         """ ord() id to chr() chr """
@@ -341,6 +341,7 @@ class U2Tester(U2Trainer):
 
         start_time = time.time()
         target_transcripts = self.id2token(texts, texts_len, self.text_feature)
+
         result_transcripts, result_tokenids = self.model.decode(
             audio,
             audio_len,
@@ -350,7 +351,8 @@ class U2Tester(U2Trainer):
             ctc_weight=decode_config.ctc_weight,
             decoding_chunk_size=decode_config.decoding_chunk_size,
             num_decoding_left_chunks=decode_config.num_decoding_left_chunks,
-            simulate_streaming=decode_config.simulate_streaming)
+            simulate_streaming=decode_config.simulate_streaming,
+            reverse_weight=self.reverse_weight)
         decode_time = time.time() - start_time
 
         for utt, target, result, rec_tids in zip(
diff --git a/paddlespeech/s2t/io/dataloader.py b/paddlespeech/s2t/io/dataloader.py
index 4cc8274f..5ba891c3 100644
--- a/paddlespeech/s2t/io/dataloader.py
+++ b/paddlespeech/s2t/io/dataloader.py
@@ -361,7 +361,7 @@ class DataLoaderFactory():
             elif mode == 'valid':
                 config['manifest'] = config.dev_manifest
                 config['train_mode'] = False
-            elif model == 'test' or mode == 'align':
+            elif mode == 'test' or mode == 'align':
                 config['manifest'] = config.test_manifest
                 config['train_mode'] = False
                 config['dither'] = 0.0
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 8a984949..48b05d20 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -31,6 +31,8 @@ from paddle import nn
 
 from paddlespeech.audio.utils.tensor_utils import add_sos_eos
 from paddlespeech.audio.utils.tensor_utils import pad_sequence
+from paddlespeech.audio.utils.tensor_utils import reverse_pad_list
+from paddlespeech.audio.utils.tensor_utils import st_reverse_pad_list
 from paddlespeech.audio.utils.tensor_utils import th_accuracy
 from paddlespeech.s2t.decoders.scorers.ctc import CTCPrefixScorer
 from paddlespeech.s2t.frontend.utility import IGNORE_ID
@@ -38,6 +40,7 @@ from paddlespeech.s2t.frontend.utility import load_cmvn
 from paddlespeech.s2t.models.asr_interface import ASRInterface
 from paddlespeech.s2t.modules.cmvn import GlobalCMVN
 from paddlespeech.s2t.modules.ctc import CTCDecoderBase
+from paddlespeech.s2t.modules.decoder import BiTransformerDecoder
 from paddlespeech.s2t.modules.decoder import TransformerDecoder
 from paddlespeech.s2t.modules.encoder import ConformerEncoder
 from paddlespeech.s2t.modules.encoder import TransformerEncoder
@@ -69,6 +72,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
                  ctc: CTCDecoderBase,
                  ctc_weight: float=0.5,
                  ignore_id: int=IGNORE_ID,
+                 reverse_weight: float=0.0,
                  lsm_weight: float=0.0,
                  length_normalized_loss: bool=False,
                  **kwargs):
@@ -82,6 +86,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
         self.vocab_size = vocab_size
         self.ignore_id = ignore_id
         self.ctc_weight = ctc_weight
+        self.reverse_weight = reverse_weight
 
         self.encoder = encoder
         self.decoder = decoder
@@ -171,12 +176,21 @@ class U2BaseModel(ASRInterface, nn.Layer):
                                             self.ignore_id)
         ys_in_lens = ys_pad_lens + 1
 
+        r_ys_pad = reverse_pad_list(ys_pad, ys_pad_lens, float(self.ignore_id))
+        r_ys_in_pad, r_ys_out_pad = add_sos_eos(r_ys_pad, self.sos, self.eos,
+                                                self.ignore_id)
         # 1. Forward decoder
-        decoder_out, _ = self.decoder(encoder_out, encoder_mask, ys_in_pad,
-                                      ys_in_lens)
+        decoder_out, r_decoder_out, _ = self.decoder(
+            encoder_out, encoder_mask, ys_in_pad, ys_in_lens, r_ys_in_pad,
+            self.reverse_weight)
 
         # 2. Compute attention loss
         loss_att = self.criterion_att(decoder_out, ys_out_pad)
+        r_loss_att = paddle.to_tensor(0.0)
+        if self.reverse_weight > 0.0:
+            r_loss_att = self.criterion_att(r_decoder_out, r_ys_out_pad)
+        loss_att = loss_att * (1 - self.reverse_weight
+                               ) + r_loss_att * self.reverse_weight
         acc_att = th_accuracy(
             decoder_out.view(-1, self.vocab_size),
             ys_out_pad,
@@ -359,6 +373,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
         # Let's assume B = batch_size
         # encoder_out: (B, maxlen, encoder_dim)
         # encoder_mask: (B, 1, Tmax)
+
         encoder_out, encoder_mask = self._forward_encoder(
             speech, speech_lengths, decoding_chunk_size,
             num_decoding_left_chunks, simulate_streaming)
@@ -500,7 +515,8 @@ class U2BaseModel(ASRInterface, nn.Layer):
             decoding_chunk_size: int=-1,
             num_decoding_left_chunks: int=-1,
             ctc_weight: float=0.0,
-            simulate_streaming: bool=False, ) -> List[int]:
+            simulate_streaming: bool=False,
+            reverse_weight: float=0.0, ) -> List[int]:
         """ Apply attention rescoring decoding, CTC prefix beam search
             is applied first to get nbest, then we resoring the nbest on
             attention decoder with corresponding encoder out
@@ -520,6 +536,9 @@ class U2BaseModel(ASRInterface, nn.Layer):
         """
         assert speech.shape[0] == speech_lengths.shape[0]
         assert decoding_chunk_size != 0
+        if reverse_weight > 0.0:
+            # decoder should be a bitransformer decoder if reverse_weight > 0.0
+            assert hasattr(self.decoder, 'right_decoder')
         device = speech.place
         batch_size = speech.shape[0]
         # For attention rescoring we only support batch_size=1
@@ -541,22 +560,30 @@ class U2BaseModel(ASRInterface, nn.Layer):
                 hyp_content, place=device, dtype=paddle.long)
             hyp_list.append(hyp_content)
         hyps_pad = pad_sequence(hyp_list, True, self.ignore_id)
+        ori_hyps_pad = hyps_pad
         hyps_lens = paddle.to_tensor(
             [len(hyp[0]) for hyp in hyps], place=device,
             dtype=paddle.long)  # (beam_size,)
         hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id)
         hyps_lens = hyps_lens + 1  # Add <sos> at begining
-
         encoder_out = encoder_out.repeat(beam_size, 1, 1)
         encoder_mask = paddle.ones(
             (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool)
-        decoder_out, _ = self.decoder(
-            encoder_out, encoder_mask, hyps_pad,
-            hyps_lens)  # (beam_size, max_hyps_len, vocab_size)
+
+        r_hyps_pad = st_reverse_pad_list(ori_hyps_pad, hyps_lens - 1, self.sos,
+                                         self.eos)
+        decoder_out, r_decoder_out, _ = self.decoder(
+            encoder_out, encoder_mask, hyps_pad, hyps_lens, r_hyps_pad,
+            reverse_weight)  # (beam_size, max_hyps_len, vocab_size)
         # ctc score in ln domain
         decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1)
         decoder_out = decoder_out.numpy()
 
+        # r_decoder_out will be 0.0, if reverse_weight is 0.0 or decoder is a
+        # conventional transformer decoder.
+        r_decoder_out = paddle.nn.functional.log_softmax(r_decoder_out, axis=-1)
+        r_decoder_out = r_decoder_out.numpy()
+
         # Only use decoder score for rescoring
         best_score = -float('inf')
         best_index = 0
@@ -567,6 +594,12 @@ class U2BaseModel(ASRInterface, nn.Layer):
                 score += decoder_out[i][j][w]
             # last decoder output token is `eos`, for laste decoder input token.
             score += decoder_out[i][len(hyp[0])][self.eos]
+            if reverse_weight > 0:
+                r_score = 0.0
+                for j, w in enumerate(hyp[0]):
+                    r_score += r_decoder_out[i][len(hyp[0]) - j - 1][w]
+                r_score += r_decoder_out[i][len(hyp[0])][self.eos]
+                score = score * (1 - reverse_weight) + r_score * reverse_weight
             # add ctc score (which in ln domain)
             score += hyp[1] * ctc_weight
             if score > best_score:
@@ -653,12 +686,24 @@ class U2BaseModel(ASRInterface, nn.Layer):
         """
         return self.ctc.log_softmax(xs)
 
-    @jit.to_static
+    # @jit.to_static
+    def is_bidirectional_decoder(self) -> bool:
+        """
+        Returns:
+            bool: True if self.decoder has a `right_decoder` attribute.
+        """
+        if hasattr(self.decoder, 'right_decoder'):
+            return True
+        else:
+            return False
+
+    # @jit.to_static
     def forward_attention_decoder(
             self,
             hyps: paddle.Tensor,
             hyps_lens: paddle.Tensor,
-            encoder_out: paddle.Tensor, ) -> paddle.Tensor:
+            encoder_out: paddle.Tensor,
+            reverse_weight: float=0.0, ) -> paddle.Tensor:
         """ Export interface for c++ call, forward decoder with multiple
             hypothesis from ctc prefix beam search and one encoder output
         Args:
@@ -676,11 +721,22 @@ class U2BaseModel(ASRInterface, nn.Layer):
         # (B, 1, T)
         encoder_mask = paddle.ones(
             [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool)
+
+        # input for right to left decoder
+        # hyps_lens counts the <sos> token, so subtract one from it.
+        r_hyps_lens = hyps_lens - 1
+        # hyps includes the leading <sos> token, so strip it to
+        # recover the original hyps.
+        r_hyps = hyps[:, 1:]
         # (num_hyps, max_hyps_len, vocab_size)
-        decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps,
-                                      hyps_lens)
+
+        r_hyps = st_reverse_pad_list(r_hyps, r_hyps_lens, self.sos, self.eos)
+
+        decoder_out, r_decoder_out, _ = self.decoder(
+            encoder_out, encoder_mask, hyps, hyps_lens, r_hyps, reverse_weight)
         decoder_out = paddle.nn.functional.log_softmax(decoder_out, axis=-1)
-        return decoder_out
+        r_decoder_out = paddle.nn.functional.log_softmax(r_decoder_out, axis=-1)
+        return decoder_out, r_decoder_out
 
     @paddle.no_grad()
     def decode(self,
@@ -692,7 +748,8 @@ class U2BaseModel(ASRInterface, nn.Layer):
                ctc_weight: float=0.0,
                decoding_chunk_size: int=-1,
                num_decoding_left_chunks: int=-1,
-               simulate_streaming: bool=False):
+               simulate_streaming: bool=False,
+               reverse_weight: float=0.0):
         """u2 decoding.
 
         Args:
@@ -764,7 +821,8 @@ class U2BaseModel(ASRInterface, nn.Layer):
                 decoding_chunk_size=decoding_chunk_size,
                 num_decoding_left_chunks=num_decoding_left_chunks,
                 ctc_weight=ctc_weight,
-                simulate_streaming=simulate_streaming)
+                simulate_streaming=simulate_streaming,
+                reverse_weight=reverse_weight)
             hyps = [hyp]
         else:
             raise ValueError(f"Not support decoding method: {decoding_method}")
@@ -801,7 +859,6 @@ class U2Model(U2DecodeModel):
         with DefaultInitializerContext(init_type):
             vocab_size, encoder, decoder, ctc = U2Model._init_from_config(
                 configs)
-
         super().__init__(
             vocab_size=vocab_size,
             encoder=encoder,
@@ -851,10 +908,20 @@ class U2Model(U2DecodeModel):
             raise ValueError(f"not support encoder type:{encoder_type}")
 
         # decoder
-        decoder = TransformerDecoder(vocab_size,
-                                     encoder.output_size(),
-                                     **configs['decoder_conf'])
-
+        decoder_type = configs.get('decoder', 'transformer')
+        logger.debug(f"U2 Decoder type: {decoder_type}")
+        if decoder_type == 'transformer':
+            decoder = TransformerDecoder(vocab_size,
+                                         encoder.output_size(),
+                                         **configs['decoder_conf'])
+        elif decoder_type == 'bitransformer':
+            assert 0.0 < configs['model_conf']['reverse_weight'] < 1.0
+            assert configs['decoder_conf']['r_num_blocks'] > 0
+            decoder = BiTransformerDecoder(vocab_size,
+                                           encoder.output_size(),
+                                           **configs['decoder_conf'])
+        else:
+            raise ValueError(f"not support decoder type:{decoder_type}")
         # ctc decoder and ctc loss
         model_conf = configs.get('model_conf', dict())
         dropout_rate = model_conf.get('ctc_dropout_rate', 0.0)
diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py
index ccc8482d..3b1a7f23 100644
--- a/paddlespeech/s2t/modules/decoder.py
+++ b/paddlespeech/s2t/modules/decoder.py
@@ -35,7 +35,6 @@ from paddlespeech.s2t.modules.mask import make_xs_mask
 from paddlespeech.s2t.modules.mask import subsequent_mask
 from paddlespeech.s2t.modules.positionwise_feed_forward import PositionwiseFeedForward
 from paddlespeech.s2t.utils.log import Log
-
 logger = Log(__name__).getlog()
 
 __all__ = ["TransformerDecoder"]
@@ -116,13 +115,19 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
             memory: paddle.Tensor,
             memory_mask: paddle.Tensor,
             ys_in_pad: paddle.Tensor,
-            ys_in_lens: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+            ys_in_lens: paddle.Tensor,
+            r_ys_in_pad: paddle.Tensor=paddle.empty([0]),
+            reverse_weight: float=0.0) -> Tuple[paddle.Tensor, paddle.Tensor]:
         """Forward decoder.
         Args:
             memory: encoded memory, float32  (batch, maxlen_in, feat)
             memory_mask: encoder memory mask, (batch, 1, maxlen_in)
             ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
             ys_in_lens: input lengths of this batch (batch)
+            r_ys_in_pad: not used in transformer decoder, in order to unify api
+                with bidirectional decoder
+            reverse_weight: not used in transformer decoder, in order to unify
+                api with bidirectional decoder
         Returns:
             (tuple): tuple containing:
                 x: decoded token score before softmax (batch, maxlen_out, vocab_size)
@@ -151,7 +156,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
         # TODO(Hui Zhang): reduce_sum not support bool type
         # olens = tgt_mask.sum(1)
         olens = tgt_mask.astype(paddle.int).sum(1)
-        return x, olens
+        return x, paddle.to_tensor(0.0), olens
 
     def forward_one_step(
             self,
@@ -251,3 +256,119 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
         state_list = [[states[i][b] for i in range(n_layers)]
                       for b in range(n_batch)]
         return logp, state_list
+
+
+class BiTransformerDecoder(BatchScorerInterface, nn.Layer):
+    """Bidirectional Transformer decoder module.
+    Args:
+        vocab_size: output dim
+        encoder_output_size: dimension of attention
+        attention_heads: the number of heads of multi head attention
+        linear_units: the hidden units number of position-wise feedforward
+        num_blocks: the number of decoder blocks
+        r_num_blocks: the number of right to left decoder blocks
+        dropout_rate: dropout rate
+        self_attention_dropout_rate: dropout rate for attention
+        input_layer: input layer type
+        use_output_layer: whether to use output layer
+        pos_enc_class: PositionalEncoding or ScaledPositionalEncoding
+        normalize_before:
+            True: use layer_norm before each sub-block of a layer.
+            False: use layer_norm after each sub-block of a layer.
+        concat_after: whether to concat attention layer's input and output
+            True: x -> x + linear(concat(x, att(x)))
+            False: x -> x + att(x)
+    """
+
+    def __init__(self,
+                 vocab_size: int,
+                 encoder_output_size: int,
+                 attention_heads: int=4,
+                 linear_units: int=2048,
+                 num_blocks: int=6,
+                 r_num_blocks: int=0,
+                 dropout_rate: float=0.1,
+                 positional_dropout_rate: float=0.1,
+                 self_attention_dropout_rate: float=0.0,
+                 src_attention_dropout_rate: float=0.0,
+                 input_layer: str="embed",
+                 use_output_layer: bool=True,
+                 normalize_before: bool=True,
+                 concat_after: bool=False,
+                 max_len: int=5000):
+
+        assert check_argument_types()
+
+        nn.Layer.__init__(self)
+        self.left_decoder = TransformerDecoder(
+            vocab_size, encoder_output_size, attention_heads, linear_units,
+            num_blocks, dropout_rate, positional_dropout_rate,
+            self_attention_dropout_rate, src_attention_dropout_rate,
+            input_layer, use_output_layer, normalize_before, concat_after,
+            max_len)
+
+        self.right_decoder = TransformerDecoder(
+            vocab_size, encoder_output_size, attention_heads, linear_units,
+            r_num_blocks, dropout_rate, positional_dropout_rate,
+            self_attention_dropout_rate, src_attention_dropout_rate,
+            input_layer, use_output_layer, normalize_before, concat_after,
+            max_len)
+
+    def forward(
+            self,
+            memory: paddle.Tensor,
+            memory_mask: paddle.Tensor,
+            ys_in_pad: paddle.Tensor,
+            ys_in_lens: paddle.Tensor,
+            r_ys_in_pad: paddle.Tensor,
+            reverse_weight: float=0.0,
+    ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+        """Forward decoder.
+        Args:
+            memory: encoded memory, float32  (batch, maxlen_in, feat)
+            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
+            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
+            ys_in_lens: input lengths of this batch (batch)
+            r_ys_in_pad: padded input token ids, int64 (batch, maxlen_out),
+                used for right to left decoder
+            reverse_weight: used for right to left decoder
+        Returns:
+            (tuple): tuple containing:
+                x: decoded token score before softmax (batch, maxlen_out,
+                    vocab_size) if use_output_layer is True,
+                r_x: decoded token score (right to left decoder)
+                    before softmax (batch, maxlen_out, vocab_size)
+                    if use_output_layer is True,
+                olens: (batch, )
+        """
+        l_x, _, olens = self.left_decoder(memory, memory_mask, ys_in_pad,
+                                          ys_in_lens)
+        r_x = paddle.to_tensor(0.0)
+        if reverse_weight > 0.0:
+            r_x, _, olens = self.right_decoder(memory, memory_mask, r_ys_in_pad,
+                                               ys_in_lens)
+        return l_x, r_x, olens
+
+    def forward_one_step(
+            self,
+            memory: paddle.Tensor,
+            memory_mask: paddle.Tensor,
+            tgt: paddle.Tensor,
+            tgt_mask: paddle.Tensor,
+            cache: Optional[List[paddle.Tensor]]=None,
+    ) -> Tuple[paddle.Tensor, List[paddle.Tensor]]:
+        """Forward one step.
+            This is only used for decoding.
+        Args:
+            memory: encoded memory, float32  (batch, maxlen_in, feat)
+            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
+            tgt: input token ids, int64 (batch, maxlen_out)
+            tgt_mask: input token mask,  (batch, maxlen_out, maxlen_out)
+                      dtype=paddle.bool
+            cache: cached output list of (batch, max_time_out-1, size)
+        Returns:
+            y, cache: NN output value and cache per `self.decoders`.
+            `y.shape` is (batch, maxlen_out, token)
+        """
+        return self.left_decoder.forward_one_step(memory, memory_mask, tgt,
+                                                  tgt_mask, cache)
diff --git a/paddlespeech/server/engine/asr/online/python/asr_engine.py b/paddlespeech/server/engine/asr/online/python/asr_engine.py
index 5782d703..ae026092 100644
--- a/paddlespeech/server/engine/asr/online/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/online/python/asr_engine.py
@@ -612,7 +612,8 @@ class PaddleASRConnectionHanddler:
         encoder_out = self.encoder_out.repeat(beam_size, 1, 1)
         encoder_mask = paddle.ones(
             (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool)
-        decoder_out, _ = self.model.decoder(
+
+        decoder_out, _, _ = self.model.decoder(
             encoder_out, encoder_mask, hyps_pad,
             hyps_lens)  # (beam_size, max_hyps_len, vocab_size)
         # ctc score in ln domain
diff --git a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
index 43b0df40..2878c852 100644
--- a/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
+++ b/paddlespeech/server/engine/tts/paddleinference/tts_engine.py
@@ -22,7 +22,6 @@ import librosa
 import numpy as np
 import paddle
 import soundfile as sf
-from scipy.io import wavfile
 
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.tts.infer import TTSExecutor
@@ -409,7 +408,8 @@ class PaddleTTSConnectionHandler(TTSServerExecutor):
 
         # wav to base64
         buf = io.BytesIO()
-        wavfile.write(buf, target_fs, wav_speed)
+        sf.write(buf, wav_speed, target_fs, format="wav")
+        buf.seek(0)
         base64_bytes = base64.b64encode(buf.read())
         wav_base64 = base64_bytes.decode('utf-8')
         logger.debug("Audio to string successfully.")
diff --git a/paddlespeech/server/engine/tts/python/tts_engine.py b/paddlespeech/server/engine/tts/python/tts_engine.py
index 4d180100..356962bd 100644
--- a/paddlespeech/server/engine/tts/python/tts_engine.py
+++ b/paddlespeech/server/engine/tts/python/tts_engine.py
@@ -20,7 +20,6 @@ import librosa
 import numpy as np
 import paddle
 import soundfile as sf
-from scipy.io import wavfile
 
 from paddlespeech.cli.log import logger
 from paddlespeech.cli.tts.infer import TTSExecutor
@@ -173,7 +172,9 @@ class PaddleTTSConnectionHandler(TTSServerExecutor):
 
         # wav to base64
         buf = io.BytesIO()
-        wavfile.write(buf, target_fs, wav_speed)
+        sf.write(buf, wav_speed, target_fs, format="wav")
+        buf.seek(0)
+
         base64_bytes = base64.b64encode(buf.read())
         wav_base64 = base64_bytes.decode('utf-8')
         logger.debug("Audio to string successfully.")
diff --git a/paddlespeech/t2s/exps/ernie_sat/align.py b/paddlespeech/t2s/exps/ernie_sat/align.py
index 8dbe685f..a802d029 100755
--- a/paddlespeech/t2s/exps/ernie_sat/align.py
+++ b/paddlespeech/t2s/exps/ernie_sat/align.py
@@ -285,7 +285,7 @@ def get_phns_spans(wav_path: str,
                 break
 
     # reverse w2p and new_w2p
-    right_idx = 0
+    right_idx = len(new_phns)
     new_phns_right = []
     sp_count = 0
     w2p_max_idx = _get_max_idx(w2p)
diff --git a/paddlespeech/t2s/exps/voice_cloning.py b/paddlespeech/t2s/exps/voice_cloning.py
index 80cfea4a..4b737205 100644
--- a/paddlespeech/t2s/exps/voice_cloning.py
+++ b/paddlespeech/t2s/exps/voice_cloning.py
@@ -135,16 +135,16 @@ def voice_cloning(args):
         print(f"{utt_id} done!")
 
     # generate 5 random_spk_emb
-    for i in range(5):
-        random_spk_emb = gen_random_embed(args.use_ecapa)
-        utt_id = "random_spk_emb"
-        with paddle.no_grad():
-            wav = voc_inference(am_inference(phone_ids, spk_emb=random_spk_emb))
-        sf.write(
-            str(output_dir / (utt_id + "_" + str(i) + ".wav")),
-            wav.numpy(),
-            samplerate=am_config.fs)
-    print(f"{utt_id} done!")
+    # for i in range(5):
+    #     random_spk_emb = gen_random_embed(args.use_ecapa)
+    #     utt_id = "random_spk_emb"
+    #     with paddle.no_grad():
+    #         wav = voc_inference(am_inference(phone_ids, spk_emb=random_spk_emb))
+    #     sf.write(
+    #         str(output_dir / (utt_id + "_" + str(i) + ".wav")),
+    #         wav.numpy(),
+    #         samplerate=am_config.fs)
+    # print(f"{utt_id} done!")
 
 
 def parse_args():