From 2c4c141de5b8bf7335370f955559d1492adf42c2 Mon Sep 17 00:00:00 2001 From: YangZhou <56786796+SmileGoat@users.noreply.github.com> Date: Tue, 10 Jan 2023 14:35:41 +0800 Subject: [PATCH 01/24] [audio] fix load paddleaudio fail (#2815) * fix paddleaudio import fail --- audio/paddleaudio/_internal/module_utils.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/audio/paddleaudio/_internal/module_utils.py b/audio/paddleaudio/_internal/module_utils.py index 7b3230de9..becd23cd8 100644 --- a/audio/paddleaudio/_internal/module_utils.py +++ b/audio/paddleaudio/_internal/module_utils.py @@ -67,8 +67,11 @@ def deprecated(direction: str, version: Optional[str]=None): def is_kaldi_available(): - return is_module_available("paddleaudio._paddleaudio") - + try: + from paddleaudio import _paddleaudio + return True + except Exception: + return False def requires_kaldi(): if is_kaldi_available(): @@ -128,9 +131,11 @@ def requires_soundfile(): def is_sox_available(): - if platform.system() == "Windows": # not support sox in windows + try: + from paddleaudio import _paddleaudio + return True + except Exception: return False - return is_module_available("paddleaudio._paddleaudio") def requires_sox(): From 88fe26f17ca4a35d007bb934cbe96550b1592508 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Tue, 10 Jan 2023 19:46:39 +0800 Subject: [PATCH 02/24] [ASR] add asr code-switch cli and demo, test='asr' (#2816) * add asr code-switch cli and demo. * fix some model named problem. --- README.md | 2 ++ README_cn.md | 2 ++ demos/speech_recognition/README.md | 28 +++++++++-------- demos/speech_recognition/README_cn.md | 29 ++++++++++-------- demos/speech_recognition/run.sh | 6 ++++ paddlespeech/cli/asr/infer.py | 28 ++++++++++++----- paddlespeech/cli/base_commands.py | 19 ++++++++++-- paddlespeech/resource/pretrained_models.py | 13 ++++++++ .../server/bin/paddlespeech_server.py | 30 ++++++++++++++----- tests/unit/cli/test_cli.sh | 3 +- 10 files changed, 118 insertions(+), 42 deletions(-) diff --git a/README.md b/README.md index dbdf6a4f8..2fb773634 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,8 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update +- 🔥 2022.01.10: Add [code-switch asr CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). +- 👑 2022.01.06: Add [code-switch asr tal_cs recipe](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). - 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model). - 🎉 2022.11.30: Add [TTS Android Demo](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid). - 🤗 2022.11.28: PP-TTS and PP-ASR demos are available in [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) and [official website diff --git a/README_cn.md b/README_cn.md index 5cc156c9f..53f6a66e4 100644 --- a/README_cn.md +++ b/README_cn.md @@ -164,6 +164,8 @@ ### 近期更新 +- 🔥 2022.01.10: 新增 [中英混合 ASR CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). 
+- 👑 2022.01.06: 新增 [ASR中英混合 tal_cs 训练推理流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/).
 - 🎉 2022.12.02: 新增 [端到端韵律预测全流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。
 - 🎉 2022.11.30: 新增 [TTS Android 部署示例](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid)。
 - 🤗 2022.11.28: PP-TTS and PP-ASR 示例可在 [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) 和[飞桨官网](https://www.paddlepaddle.org.cn/models)体验!
diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md
index c815a88af..ee2acd6fd 100644
--- a/demos/speech_recognition/README.md
+++ b/demos/speech_recognition/README.md
@@ -17,7 +17,7 @@ The input of this demo should be a WAV file(`.wav`), and the sample rate must be
 Here are sample files for this demo that can be downloaded:
 ```bash
-wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
+wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
 ```
 
 ### 3. Usage
@@ -27,6 +27,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
     paddlespeech asr --input ./zh.wav -v
     # English
     paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v
+    # Code-Switch
+    paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav -v
     # Chinese ASR + Punctuation Restoration
     paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v
 ```
@@ -40,6 +42,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
   - `input`(required): Audio file to recognize.
   - `model`: Model type of asr task. Default: `conformer_wenetspeech`.
   - `lang`: Model language. Default: `zh`.
+  - `codeswitch`: Whether to use the code-switch model. Default: `False`.
   - `sample_rate`: Sample rate of the model. Default: `16000`.
   - `config`: Config of asr task. Use pretrained model when it is None. Default: `None`.
   - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`. 
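The `--codeswitch` CLI flag documented above maps onto the `codeswitch` keyword that this series adds to `ASRExecutor` in `paddlespeech/cli/asr/infer.py` (see the hunks later in this patch). As a rough sketch of the equivalent Python call — assuming the `ch_zh_mix.wav` sample above has been downloaded — the code-switch model can be invoked like this:

```python
from paddlespeech.cli.asr.infer import ASRExecutor

asr = ASRExecutor()
# Per the _init_from_path change in this patch, codeswitch=True is only
# valid together with lang="zh_en"; other combinations raise an exception.
text = asr(
    audio_file="./ch_zh_mix.wav",
    model="conformer_talcs",
    lang="zh_en",
    codeswitch=True,
    sample_rate=16000)
print(text)
```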
@@ -83,14 +86,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee Here is a list of pretrained models released by PaddleSpeech that can be used by command and python API: -| Model | Language | Sample Rate -| :--- | :---: | :---: | -| conformer_wenetspeech | zh | 16k -| conformer_online_multicn | zh | 16k -| conformer_aishell | zh | 16k -| conformer_online_aishell | zh | 16k -| transformer_librispeech | en | 16k -| deepspeech2online_wenetspeech | zh | 16k -| deepspeech2offline_aishell| zh| 16k -| deepspeech2online_aishell | zh | 16k -| deepspeech2offline_librispeech | en | 16k +| Model | Code Switch | Language | Sample Rate +| :--- | :---: | :---: | :---: | +| conformer_wenetspeech | False | zh | 16k +| conformer_online_multicn | False | zh | 16k +| conformer_aishell | False | zh | 16k +| conformer_online_aishell | False | zh | 16k +| transformer_librispeech | False | en | 16k +| deepspeech2online_wenetspeech | False | zh | 16k +| deepspeech2offline_aishell | False | zh| 16k +| deepspeech2online_aishell | False | zh | 16k +| deepspeech2offline_librispeech | False | en | 16k +| conformer_talcs | True | zh_en | 16k diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index 13aa9f277..62dce3bc9 100644 --- a/demos/speech_recognition/README_cn.md +++ b/demos/speech_recognition/README_cn.md @@ -1,4 +1,5 @@ (简体中文|[English](./README.md)) + (简体中文|[English](./README.md)) # 语音识别 ## 介绍 @@ -16,7 +17,7 @@ 可以下载此 demo 的示例音频: ```bash -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav ``` ### 3. 
使用方法 - 命令行 (推荐使用) @@ -25,6 +26,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee paddlespeech asr --input ./zh.wav -v # 英文 paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav -v + #中英混合 + paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav -v # 中文 + 标点恢复 paddlespeech asr --input ./zh.wav -v | paddlespeech text --task punc -v ``` @@ -38,6 +41,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - `input`(必须输入):用于识别的音频文件。 - `model`:ASR 任务的模型,默认值:`conformer_wenetspeech`。 - `lang`:模型语言,默认值:`zh`。 + - `codeswitch`: 是否使用语言转换,默认值:`False`。 - `sample_rate`:音频采样率,默认值:`16000`。 - `config`:ASR 任务的参数文件,若不设置则使用预训练模型中的默认配置,默认值:`None`。 - `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`。 @@ -80,14 +84,15 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee ### 4.预训练模型 以下是 PaddleSpeech 提供的可以被命令行和 python API 使用的预训练模型列表: -| 模型 | 语言 | 采样率 -| :--- | :---: | :---: | -| conformer_wenetspeech | zh | 16k -| conformer_online_multicn | zh | 16k -| conformer_aishell | zh | 16k -| conformer_online_aishell | zh | 16k -| transformer_librispeech | en | 16k -| deepspeech2online_wenetspeech | zh | 16k -| deepspeech2offline_aishell| zh| 16k -| deepspeech2online_aishell | zh | 16k -| deepspeech2offline_librispeech | en | 16k +| 模型 | 语言转换 | 语言 | 采样率 +| :--- | :---: | :---: | :---: | +| conformer_wenetspeech | False | zh | 16k +| conformer_online_multicn | False | zh | 16k +| conformer_aishell | False | zh | 16k +| conformer_online_aishell | False | zh | 16k +| transformer_librispeech | False | en | 16k +| deepspeech2online_wenetspeech | False | zh | 16k +| deepspeech2offline_aishell | False | zh| 16k +| deepspeech2online_aishell | False | zh | 16k +| deepspeech2offline_librispeech | False | en | 16k +| conformer_talcs | True | zh_en | 16k diff --git a/demos/speech_recognition/run.sh b/demos/speech_recognition/run.sh index e48ff3e96..8ba6e4c3e 100755 --- a/demos/speech_recognition/run.sh +++ b/demos/speech_recognition/run.sh @@ -2,6 +2,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav # asr paddlespeech asr --input ./zh.wav @@ -18,6 +19,11 @@ paddlespeech asr --help # english asr paddlespeech asr --lang en --model transformer_librispeech --input ./en.wav + +# code-switch asr +paddlespeech asr --lang zh_en --codeswitch True --model conformer_talcs --input ./ch_zh_mix.wav + + # model stats paddlespeech stats --task asr diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 004143361..7a7aef8b0 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -25,6 +25,9 @@ import librosa import numpy as np import paddle import soundfile +from paddlespeech.audio.transform.transformation import Transformation +from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer +from paddlespeech.s2t.utils.utility import UpdateConfig from yacs.config import CfgNode from ...utils.env import MODEL_HOME @@ -34,9 +37,6 @@ from ..log import logger from ..utils import CLI_TIMER from ..utils import stats_wrapper from ..utils import timer_register -from paddlespeech.audio.transform.transformation import Transformation -from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer -from paddlespeech.s2t.utils.utility import UpdateConfig __all__ = ['ASRExecutor'] @@ 
-62,8 +62,13 @@ class ASRExecutor(BaseExecutor): '--lang', type=str, default='zh', - help='Choose model language. zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]' + help='Choose model language. [zh, en, zh_en], zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k], zh_en:[conformer_talcs-codeswitch_zh_en-16k]' ) + self.parser.add_argument( + '--codeswitch', + type=bool, + default=False, + help='Choose whether use code-switch. True or False.') self.parser.add_argument( "--sample_rate", type=int, @@ -127,6 +132,7 @@ class ASRExecutor(BaseExecutor): def _init_from_path(self, model_type: str='wenetspeech', lang: str='zh', + codeswitch: bool=False, sample_rate: int=16000, cfg_path: Optional[os.PathLike]=None, decode_method: str='attention_rescoring', @@ -144,7 +150,12 @@ class ASRExecutor(BaseExecutor): if cfg_path is None or ckpt_path is None: sample_rate_str = '16k' if sample_rate == 16000 else '8k' - tag = model_type + '-' + lang + '-' + sample_rate_str + if lang == "zh_en" and codeswitch is True: + tag = model_type + '-' + 'codeswitch_' + lang + '-' + sample_rate_str + elif lang == "zh_en" or codeswitch is True: + raise Exception("codeswitch is true only in zh_en model") + else: + tag = model_type + '-' + lang + '-' + sample_rate_str self.task_resource.set_task_model(tag, version=None) self.res_path = self.task_resource.res_dir @@ -423,6 +434,7 @@ class ASRExecutor(BaseExecutor): model = parser_args.model lang = parser_args.lang + codeswitch = parser_args.codeswitch sample_rate = parser_args.sample_rate config = parser_args.config ckpt_path = parser_args.ckpt_path @@ -444,6 +456,7 @@ class ASRExecutor(BaseExecutor): audio_file=input_, model=model, lang=lang, + codeswitch=codeswitch, sample_rate=sample_rate, config=config, ckpt_path=ckpt_path, @@ -472,6 +485,7 @@ class ASRExecutor(BaseExecutor): audio_file: os.PathLike, model: str='conformer_u2pp_online_wenetspeech', lang: str='zh', + codeswitch: bool=False, sample_rate: int=16000, config: os.PathLike=None, ckpt_path: os.PathLike=None, @@ -485,8 +499,8 @@ class ASRExecutor(BaseExecutor): """ audio_file = os.path.abspath(audio_file) paddle.set_device(device) - self._init_from_path(model, lang, sample_rate, config, decode_method, - num_decoding_left_chunks, ckpt_path) + self._init_from_path(model, lang, codeswitch, sample_rate, config, + decode_method, num_decoding_left_chunks, ckpt_path) if not self._check(audio_file, sample_rate, force_yes): sys.exit(-1) if rtf: diff --git a/paddlespeech/cli/base_commands.py b/paddlespeech/cli/base_commands.py index 767d0df78..dfeb5cae5 100644 --- a/paddlespeech/cli/base_commands.py +++ b/paddlespeech/cli/base_commands.py @@ -14,6 +14,7 @@ import argparse from typing import List +import numpy from prettytable import PrettyTable from ..resource import CommonTaskResource @@ -78,7 +79,7 @@ class VersionCommand: model_name_format = { - 'asr': 'Model-Language-Sample Rate', + 'asr': 'Model-Size-Code Switch-Multilingual-Language-Sample Rate', 'cls': 'Model-Sample Rate', 'st': 'Model-Source language-Target language', 'text': 'Model-Task-Language', @@ -111,7 +112,21 @@ class StatsCommand: fields = model_name_format[self.task].split("-") table = PrettyTable(fields) for key in pretrained_models: - table.add_row(key.split("-")) + line = key.split("-") + if self.task == "asr" and len(line) < len(fields): + for i in range(len(line), len(fields)): + line.append("-") + if "codeswitch" in key: + line[3], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + elif 
"multilingual" in key: + line[4], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + tmp = numpy.array(line) + idx = [0, 5, 3, 4, 1, 2] + line = tmp[idx] + table.add_row(line) + print(table) def execute(self, argv: List[str]) -> bool: diff --git a/paddlespeech/resource/pretrained_models.py b/paddlespeech/resource/pretrained_models.py index 3c5aa1f90..ff0b30f6d 100644 --- a/paddlespeech/resource/pretrained_models.py +++ b/paddlespeech/resource/pretrained_models.py @@ -30,6 +30,7 @@ __all__ = [ ] # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]". +# Add code-switch and multilingual tag, "{model_name}[_{dataset}]-[codeswitch/multilingual][_{lang}][-...]". # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k". # Command line and python api use "{model_name}[_{dataset}]" as --model, usage: # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" @@ -322,6 +323,18 @@ asr_dynamic_pretrained_models = { '099a601759d467cd0a8523ff939819c5' }, }, + "conformer_talcs-codeswitch_zh_en-16k": { + '1.4': { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/tal_cs/asr1/asr1_conformer_talcs_ckpt_1.4.0.model.tar.gz', + 'md5': + '01962c5d0a70878fe41cacd4f61e14d1', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/conformer/checkpoints/avg_10' + }, + }, } asr_static_pretrained_models = { diff --git a/paddlespeech/server/bin/paddlespeech_server.py b/paddlespeech/server/bin/paddlespeech_server.py index 1b1792bd1..299a8c3d4 100644 --- a/paddlespeech/server/bin/paddlespeech_server.py +++ b/paddlespeech/server/bin/paddlespeech_server.py @@ -16,14 +16,9 @@ import sys import warnings from typing import List +import numpy import uvicorn from fastapi import FastAPI -from prettytable import PrettyTable -from starlette.middleware.cors import CORSMiddleware - -from ..executor import BaseExecutor -from ..util import cli_server_register -from ..util import stats_wrapper from paddlespeech.cli.log import logger from paddlespeech.resource import CommonTaskResource from paddlespeech.server.engine.engine_pool import init_engine_pool @@ -31,6 +26,12 @@ from paddlespeech.server.engine.engine_warmup import warm_up from paddlespeech.server.restful.api import setup_router as setup_http_router from paddlespeech.server.utils.config import get_config from paddlespeech.server.ws.api import setup_router as setup_ws_router +from prettytable import PrettyTable +from starlette.middleware.cors import CORSMiddleware + +from ..executor import BaseExecutor +from ..util import cli_server_register +from ..util import stats_wrapper warnings.filterwarnings("ignore") __all__ = ['ServerExecutor', 'ServerStatsExecutor'] @@ -134,7 +135,7 @@ class ServerStatsExecutor(): required=True) self.task_choices = ['asr', 'tts', 'cls', 'text', 'vector'] self.model_name_format = { - 'asr': 'Model-Language-Sample Rate', + 'asr': 'Model-Size-Code Switch-Multilingual-Language-Sample Rate', 'tts': 'Model-Language', 'cls': 'Model-Sample Rate', 'text': 'Model-Task-Language', @@ -145,7 +146,20 @@ class ServerStatsExecutor(): fields = self.model_name_format[self.task].split("-") table = PrettyTable(fields) for key in pretrained_models: - table.add_row(key.split("-")) + line = key.split("-") + if self.task == "asr" and len(line) < len(fields): + for i in range(len(line), len(fields)): + line.append("-") + if "codeswitch" in key: + line[3], line[1] = line[1].split("_")[0], line[1].split( + "_")[1:] + elif "multilingual" in key: + line[4], line[1] = line[1].split("_")[0], line[1].split( + 
"_")[1:] + tmp = numpy.array(line) + idx = [0, 5, 3, 4, 1, 2] + line = tmp[idx] + table.add_row(line) print(table) def execute(self, argv: List[str]) -> bool: diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index 3a58626d2..5d3b76f6c 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -14,7 +14,7 @@ paddlespeech ssl --task asr --lang en --input ./en.wav paddlespeech ssl --task vector --lang en --input ./en.wav # Speech_recognition -wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav paddlespeech asr --input ./zh.wav paddlespeech asr --model conformer_aishell --input ./zh.wav paddlespeech asr --model conformer_online_aishell --input ./zh.wav @@ -26,6 +26,7 @@ paddlespeech asr --model deepspeech2offline_aishell --input ./zh.wav paddlespeech asr --model deepspeech2online_wenetspeech --input ./zh.wav paddlespeech asr --model deepspeech2online_aishell --input ./zh.wav paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.wav +paddlespeech asr --model conformer_talcs --lang zh_en --codeswitch True --input ./ch_zh_mix.wav # Support editing num_decoding_left_chunks paddlespeech asr --model conformer_online_wenetspeech --num_decoding_left_chunks 3 --input ./zh.wav From faa2f866516e1e1afb40b25df907ebe3078bd078 Mon Sep 17 00:00:00 2001 From: HuangLiangJie Date: Wed, 11 Jan 2023 12:59:41 +0800 Subject: [PATCH 03/24] [TTS]update VITS init method (#2809) --- paddlespeech/t2s/models/vits/text_encoder.py | 13 +- paddlespeech/t2s/models/vits/vits.py | 55 +++- paddlespeech/utils/initialize.py | 321 +++++++++++++++++++ 3 files changed, 375 insertions(+), 14 deletions(-) create mode 100644 paddlespeech/utils/initialize.py diff --git a/paddlespeech/t2s/models/vits/text_encoder.py b/paddlespeech/t2s/models/vits/text_encoder.py index 799e0c759..015ed76c6 100644 --- a/paddlespeech/t2s/models/vits/text_encoder.py +++ b/paddlespeech/t2s/models/vits/text_encoder.py @@ -24,6 +24,7 @@ from paddle import nn from paddlespeech.t2s.modules.nets_utils import make_non_pad_mask from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder as Encoder +from paddlespeech.utils.initialize import normal_ class TextEncoder(nn.Layer): @@ -105,10 +106,6 @@ class TextEncoder(nn.Layer): # define modules self.emb = nn.Embedding(vocabs, attention_dim) - dist = paddle.distribution.Normal(loc=0.0, scale=attention_dim**-0.5) - w = dist.sample(self.emb.weight.shape) - self.emb.weight.set_value(w) - self.encoder = Encoder( idim=-1, input_layer=None, @@ -130,6 +127,8 @@ class TextEncoder(nn.Layer): cnn_module_kernel=conformer_kernel_size, ) self.proj = nn.Conv1D(attention_dim, attention_dim * 2, 1) + self.reset_parameters() + def forward( self, x: paddle.Tensor, @@ -166,3 +165,9 @@ class TextEncoder(nn.Layer): m, logs = paddle.split(stats, 2, axis=1) return x, m, logs, x_mask + + def reset_parameters(self): + normal_(self.emb.weight, mean=0.0, std=self.attention_dim**-0.5) + if self.emb._padding_idx is not None: + with paddle.no_grad(): + self.emb.weight[self.emb._padding_idx] = 0 diff --git a/paddlespeech/t2s/models/vits/vits.py b/paddlespeech/t2s/models/vits/vits.py index 0ff3a546d..e68ed5643 100644 --- a/paddlespeech/t2s/models/vits/vits.py +++ b/paddlespeech/t2s/models/vits/vits.py @@ -13,6 +13,7 @@ # limitations under 
the License. # Modified from espnet(https://github.com/espnet/espnet) """VITS module""" +import math from typing import Any from typing import Dict from typing import Optional @@ -27,7 +28,12 @@ from paddlespeech.t2s.models.hifigan import HiFiGANMultiScaleMultiPeriodDiscrimi from paddlespeech.t2s.models.hifigan import HiFiGANPeriodDiscriminator from paddlespeech.t2s.models.hifigan import HiFiGANScaleDiscriminator from paddlespeech.t2s.models.vits.generator import VITSGenerator -from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.utils.initialize import _calculate_fan_in_and_fan_out +from paddlespeech.utils.initialize import kaiming_uniform_ +from paddlespeech.utils.initialize import normal_ +from paddlespeech.utils.initialize import ones_ +from paddlespeech.utils.initialize import uniform_ +from paddlespeech.utils.initialize import zeros_ AVAILABLE_GENERATERS = { "vits_generator": VITSGenerator, @@ -152,8 +158,7 @@ class VITS(nn.Layer): "use_spectral_norm": False, }, }, - cache_generator_outputs: bool=True, - init_type: str="xavier_uniform", ): + cache_generator_outputs: bool=True, ): """Initialize VITS module. Args: idim (int): @@ -179,9 +184,6 @@ class VITS(nn.Layer): assert check_argument_types() super().__init__() - # initialize parameters - initialize(self, init_type) - # define modules generator_class = AVAILABLE_GENERATERS[generator_type] if generator_type == "vits_generator": @@ -196,8 +198,6 @@ class VITS(nn.Layer): self.discriminator = discriminator_class( **discriminator_params, ) - nn.initializer.set_global_initializer(None) - # cache self.cache_generator_outputs = cache_generator_outputs self._cache = None @@ -214,6 +214,10 @@ class VITS(nn.Layer): self.reuse_cache_gen = True self.reuse_cache_dis = True + self.reset_parameters() + self.generator.decoder.reset_parameters() + self.generator.text_encoder.reset_parameters() + def forward( self, text: paddle.Tensor, @@ -243,7 +247,7 @@ class VITS(nn.Layer): forward_generator (bool): Whether to forward generator. Returns: - + """ if forward_generator: return self._forward_generator( @@ -290,7 +294,7 @@ class VITS(nn.Layer): lids (Optional[Tensor]): Language index tensor (B,) or (B, 1). 
Returns: - + """ # setup feats = feats.transpose([0, 2, 1]) @@ -497,3 +501,34 @@ class VITS(nn.Layer): lids, ) return dict(wav=paddle.reshape(wav, [-1])) + + def reset_parameters(self): + def _reset_parameters(module): + if isinstance(module, + (nn.Conv1D, nn.Conv1DTranspose, nn.Conv2D, nn.Conv2DTranspose)): + kaiming_uniform_(module.weight, a=math.sqrt(5)) + if module.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(module.weight) + if fan_in != 0: + bound = 1 / math.sqrt(fan_in) + uniform_(module.bias, -bound, bound) + + if isinstance(module, + (nn.BatchNorm1D, nn.BatchNorm2D, nn.GroupNorm, nn.LayerNorm)): + ones_(module.weight) + zeros_(module.bias) + + if isinstance(module, nn.Linear): + kaiming_uniform_(module.weight, a=math.sqrt(5)) + if module.bias is not None: + fan_in, _ = _calculate_fan_in_and_fan_out(module.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + uniform_(module.bias, -bound, bound) + + if isinstance(module, nn.Embedding): + normal_(module.weight) + if module._padding_idx is not None: + with paddle.no_grad(): + module.weight[module._padding_idx] = 0 + + self.apply(_reset_parameters) diff --git a/paddlespeech/utils/initialize.py b/paddlespeech/utils/initialize.py new file mode 100644 index 000000000..8ebe6845e --- /dev/null +++ b/paddlespeech/utils/initialize.py @@ -0,0 +1,321 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +This code is based on https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py +Ths copyright of pytorch/pytorch is a BSD-style license, as found in the LICENSE file. +""" +import math + +import numpy as np +import paddle +import paddle.nn as nn + +__all__ = [ + "uniform_", + "normal_", + "constant_", + "ones_", + "zeros_", + "xavier_uniform_", + "xavier_normal_", + "kaiming_uniform_", + "kaiming_normal_", + "linear_init_", + "conv_init_", + "reset_initialized_parameter", + "_calculate_fan_in_and_fan_out", +] + + +def _no_grad_uniform_(tensor, a, b): + with paddle.no_grad(): + tensor.set_value( + paddle.uniform( + shape=tensor.shape, dtype=tensor.dtype, min=a, max=b)) + return tensor + + +def _no_grad_normal_(tensor, mean=0.0, std=1.0): + with paddle.no_grad(): + tensor.set_value(paddle.normal(mean=mean, std=std, shape=tensor.shape)) + return tensor + + +def _no_grad_fill_(tensor, value=0.0): + with paddle.no_grad(): + tensor.set_value(paddle.full_like(tensor, value, dtype=tensor.dtype)) + return tensor + + +def uniform_(tensor, a, b): + """ + Modified tensor inspace using uniform_ + Args: + tensor (paddle.Tensor): paddle Tensor + a (float|int): min value. + b (float|int): max value. + Return: + tensor + """ + return _no_grad_uniform_(tensor, a, b) + + +def normal_(tensor, mean=0.0, std=1.0): + """ + Modified tensor inspace using normal_ + Args: + tensor (paddle.Tensor): paddle Tensor + mean (float|int): mean value. + std (float|int): std value. 
+    Return:
+        tensor
+    """
+    return _no_grad_normal_(tensor, mean, std)
+
+
+def constant_(tensor, value=0.0):
+    """
+    Modify tensor in place using constant_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        value (float|int): value to fill tensor.
+    Return:
+        tensor
+    """
+    return _no_grad_fill_(tensor, value)
+
+
+def ones_(tensor):
+    """
+    Modify tensor in place using ones_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+    Return:
+        tensor
+    """
+    return _no_grad_fill_(tensor, 1)
+
+
+def zeros_(tensor):
+    """
+    Modify tensor in place using zeros_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+    Return:
+        tensor
+    """
+    return _no_grad_fill_(tensor, 0)
+
+
+def vector_(tensor, vector):
+    with paddle.no_grad():
+        tensor.set_value(paddle.to_tensor(vector, dtype=tensor.dtype))
+    return tensor
+
+
+def _calculate_fan_in_and_fan_out(tensor, reverse=False):
+    """
+    Calculate (fan_in, fan_out) for tensor
+    Args:
+        tensor (Tensor): paddle.Tensor
+        reverse (bool: False): tensor data format order, False by default as [fout, fin, ...]. e.g. : conv.weight [cout, cin, kh, kw] is False; linear.weight [cin, cout] is True
+    Return:
+        Tuple[fan_in, fan_out]
+    """
+    if tensor.ndim < 2:
+        raise ValueError(
+            "Fan in and fan out can not be computed for tensor with fewer than 2 dimensions"
+        )
+
+    if reverse:
+        num_input_fmaps, num_output_fmaps = tensor.shape[0], tensor.shape[1]
+    else:
+        num_input_fmaps, num_output_fmaps = tensor.shape[1], tensor.shape[0]
+
+    receptive_field_size = 1
+    if tensor.ndim > 2:
+        receptive_field_size = np.prod(tensor.shape[2:])
+
+    fan_in = num_input_fmaps * receptive_field_size
+    fan_out = num_output_fmaps * receptive_field_size
+
+    return fan_in, fan_out
+
+
+def xavier_uniform_(tensor, gain=1.0, reverse=False):
+    """
+    Modify tensor in place using xavier_uniform_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        gain (float): scaling factor, 1.0 by default.
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...].
+    Return:
+        tensor
+    """
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
+    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
+    k = math.sqrt(3.0) * std
+    return _no_grad_uniform_(tensor, -k, k)
+
+
+def xavier_normal_(tensor, gain=1.0, reverse=False):
+    """
+    Modify tensor in place using xavier_normal_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        gain (float): scaling factor, 1.0 by default.
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...]. 
+    Return:
+        tensor
+    """
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse=reverse)
+    std = gain * math.sqrt(2.0 / float(fan_in + fan_out))
+    return _no_grad_normal_(tensor, 0, std)
+
+
+# reference: https://pytorch.org/docs/stable/_modules/torch/nn/init.html
+def _calculate_correct_fan(tensor, mode, reverse=False):
+    mode = mode.lower()
+    valid_modes = ["fan_in", "fan_out"]
+    if mode not in valid_modes:
+        raise ValueError("Mode {} not supported, please use one of {}".format(
+            mode, valid_modes))
+
+    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor, reverse)
+
+    return fan_in if mode == "fan_in" else fan_out
+
+
+def _calculate_gain(nonlinearity, param=None):
+    linear_fns = [
+        "linear", "conv1d", "conv2d", "conv3d", "conv_transpose1d",
+        "conv_transpose2d", "conv_transpose3d"
+    ]
+    if nonlinearity in linear_fns or nonlinearity == "sigmoid":
+        return 1
+    elif nonlinearity == "tanh":
+        return 5.0 / 3
+    elif nonlinearity == "relu":
+        return math.sqrt(2.0)
+    elif nonlinearity == "leaky_relu":
+        if param is None:
+            negative_slope = 0.01
+        elif not isinstance(param, bool) and isinstance(
+                param, int) or isinstance(param, float):
+            # True/False are instances of int, hence check above
+            negative_slope = param
+        else:
+            raise ValueError(
+                "negative_slope {} not a valid number".format(param))
+        return math.sqrt(2.0 / (1 + negative_slope**2))
+    elif nonlinearity == "selu":
+        return 3.0 / 4
+    else:
+        raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
+
+
+def kaiming_uniform_(tensor,
+                     a=0,
+                     mode="fan_in",
+                     nonlinearity="leaky_relu",
+                     reverse=False):
+    """
+    Modify tensor in place using the kaiming_uniform method
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        mode (str): ['fan_in', 'fan_out'], 'fan_in' by default
+        nonlinearity (str): nonlinearity method name
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...].
+    Return:
+        tensor
+    """
+    fan = _calculate_correct_fan(tensor, mode, reverse)
+    gain = _calculate_gain(nonlinearity, a)
+    std = gain / math.sqrt(fan)
+    k = math.sqrt(3.0) * std
+    return _no_grad_uniform_(tensor, -k, k)
+
+
+def kaiming_normal_(tensor,
+                    a=0,
+                    mode="fan_in",
+                    nonlinearity="leaky_relu",
+                    reverse=False):
+    """
+    Modify tensor in place using kaiming_normal_
+    Args:
+        tensor (paddle.Tensor): paddle Tensor
+        mode (str): ['fan_in', 'fan_out'], 'fan_in' by default
+        nonlinearity (str): nonlinearity method name
+        reverse (bool): tensor data format order, False by default as [fout, fin, ...]. 
+ Return: + tensor + """ + fan = _calculate_correct_fan(tensor, mode, reverse) + gain = _calculate_gain(nonlinearity, a) + std = gain / math.sqrt(fan) + return _no_grad_normal_(tensor, 0, std) + + +def linear_init_(module): + bound = 1 / math.sqrt(module.weight.shape[0]) + uniform_(module.weight, -bound, bound) + uniform_(module.bias, -bound, bound) + + +def conv_init_(module): + bound = 1 / np.sqrt(np.prod(module.weight.shape[1:])) + uniform_(module.weight, -bound, bound) + if module.bias is not None: + uniform_(module.bias, -bound, bound) + + +def bias_init_with_prob(prior_prob=0.01): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +@paddle.no_grad() +def reset_initialized_parameter(model, include_self=True): + """ + Reset initialized parameter using following method for [conv, linear, embedding, bn] + Args: + model (paddle.Layer): paddle Layer + include_self (bool: False): include_self for Layer.named_sublayers method. Indicate whether including itself + Return: + None + """ + for _, m in model.named_sublayers(include_self=include_self): + if isinstance(m, nn.Conv2D): + k = float(m._groups) / (m._in_channels * m._kernel_size[0] * + m._kernel_size[1]) + k = math.sqrt(k) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Linear): + k = math.sqrt(1.0 / m.weight.shape[0]) + _no_grad_uniform_(m.weight, -k, k) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_uniform_(m.bias, -k, k) + + elif isinstance(m, nn.Embedding): + _no_grad_normal_(m.weight, mean=0.0, std=1.0) + + elif isinstance(m, (nn.BatchNorm2D, nn.LayerNorm)): + _no_grad_fill_(m.weight, 1.0) + if hasattr(m, "bias") and getattr(m, "bias") is not None: + _no_grad_fill_(m.bias, 0) From ad40dafa856b9c4539e7b9f82bad2d9ff8c317f4 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Thu, 12 Jan 2023 10:23:56 +0800 Subject: [PATCH 04/24] fix some bug. 
(#2825) --- paddlespeech/s2t/models/whisper/tokenizer.py | 4 ++++ paddlespeech/s2t/models/whisper/whipser.py | 14 +++++++------- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/paddlespeech/s2t/models/whisper/tokenizer.py b/paddlespeech/s2t/models/whisper/tokenizer.py index 8bd85c914..1e1aea044 100644 --- a/paddlespeech/s2t/models/whisper/tokenizer.py +++ b/paddlespeech/s2t/models/whisper/tokenizer.py @@ -155,6 +155,10 @@ class Tokenizer: if ids < len(self.tokenizer): ids_list.append(ids) token_ids = ids_list + elif len(token_ids) == 1: + token_ids = token_ids[0] + else: + raise ValueError(f"token_ids {token_ids} load error.") return self.tokenizer.decode(token_ids, **kwargs) diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py index 63cafbdb7..81692f37a 100644 --- a/paddlespeech/s2t/models/whisper/whipser.py +++ b/paddlespeech/s2t/models/whisper/whipser.py @@ -17,12 +17,11 @@ from typing import Union import numpy as np import paddle import paddle.nn.functional as F +import paddlespeech.s2t.modules.align as paddlespeech_nn import soundfile import tqdm from paddle import nn from paddle.distribution import Categorical - -import paddlespeech.s2t.modules.align as paddlespeech_nn from paddlespeech.s2t.models.whisper import utils from paddlespeech.s2t.models.whisper.tokenizer import get_tokenizer from paddlespeech.s2t.models.whisper.tokenizer import LANGUAGES @@ -771,8 +770,10 @@ class GreedyDecoder(TokenDecoder): if temperature == 0: next_tokens = paddle.argmax(logits, axis=-1) else: - next_tokens = Categorical(logits=logits / temperature).sample( - shape=logits.shape) + next_tokens = Categorical(logits=logits / temperature).sample([1]) + next_tokens = paddle.reshape(next_tokens, [ + next_tokens.shape[0] * next_tokens.shape[1], + ]) logprobs = F.log_softmax(logits, axis=-1, dtype=paddle.float32) current_logprobs = logprobs[paddle.arange(logprobs.shape[0]), @@ -1205,9 +1206,8 @@ class DecodingTask: DecodingResult( audio_features=features, language=language, - language_probs=probs) - for features, language, probs in zip(audio_features, languages, - language_probs) + language_probs=probs) for features, language, probs in + zip(audio_features, languages, language_probs) ] # repeat the audio & text tensors by the group size, for beam search or best-of-n sampling From a99244d86e56a0d796f04919b4e6493b6d4d22a6 Mon Sep 17 00:00:00 2001 From: cxumol Date: Wed, 11 Jan 2023 22:04:10 -0800 Subject: [PATCH 05/24] fix: whisper language choice, test=asr (#2828) --- paddlespeech/s2t/models/whisper/whipser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/s2t/models/whisper/whipser.py b/paddlespeech/s2t/models/whisper/whipser.py index 81692f37a..9cf9a9eca 100644 --- a/paddlespeech/s2t/models/whisper/whipser.py +++ b/paddlespeech/s2t/models/whisper/whipser.py @@ -476,7 +476,7 @@ def transcribe( decode_options["fp16"] = False if decode_options.get( - "language", 'None') or decode_options.get("language", None) is None: + "language") == 'None' or decode_options.get("language", None) is None: if not model.is_multilingual: decode_options["language"] = "en" else: From 742523fb38f521aaa93431658a7eb2042b2bad81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=99=8B=E4=B8=9C=E6=AF=85?= Date: Fri, 13 Jan 2023 15:45:49 +0800 Subject: [PATCH 06/24] [tts]For mixed Chinese and English speech synthesis, add SSML support for Chinese (#2830) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 
添加.history * [tts]添加中英混合语音合成时对中文SSML的支持 --- .gitignore | 1 + paddlespeech/t2s/frontend/mix_frontend.py | 57 ++++++++++++++++++++--- paddlespeech/t2s/ssml/xml_processor.py | 34 ++++++++++++++ 3 files changed, 86 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index 75f56b604..4a0c43312 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ *.egg-info build *output/ +.history audio/dist/ audio/fc_patch/ diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py index 19c98d53f..c13a5ab62 100644 --- a/paddlespeech/t2s/frontend/mix_frontend.py +++ b/paddlespeech/t2s/frontend/mix_frontend.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import re from typing import Dict from typing import List @@ -18,6 +19,7 @@ import paddle from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.zh_frontend import Frontend +from paddlespeech.t2s.ssml.xml_processor import MixTextProcessor class MixFrontend(): @@ -107,7 +109,40 @@ class MixFrontend(): add_sp: bool=True, to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]: - segments = self.get_segment(sentence) + ''' 1. 添加SSML支持,先列出 文字 和 标签内容, + 然后添加到tmpSegments数组里 + ''' + d_inputs = MixTextProcessor.get_dom_split(sentence) + tmpSegments = [] + for instr in d_inputs: + ''' 暂时只支持 say-as ''' + if instr.lower().startswith("" + segments.append(tuple(currentSeg)) + segments.append(seg) + currentSeg = ["", ""] + else: + if currentSeg[0] == '': + currentSeg[0] = seg[0] + currentSeg[1] = seg[1] + else: + currentSeg[0] = currentSeg[0] + seg[0] + if currentSeg[0] != '': + currentSeg[0] = "" + currentSeg[0] + "" + segments.append(tuple(currentSeg)) phones_list = [] result = {} @@ -120,11 +155,21 @@ class MixFrontend(): input_ids = self.en_frontend.get_input_ids( content, merge_sentences=False, to_tensor=to_tensor) else: - input_ids = self.zh_frontend.get_input_ids( - content, - merge_sentences=False, - get_tone_ids=get_tone_ids, - to_tensor=to_tensor) + ''' 3. 
把带speak tag的中文和普通文字分开处理 + ''' + if content.strip() != "" and \ + re.match(r".*?.*?.*", content, re.DOTALL): + input_ids = self.zh_frontend.get_input_ids_ssml( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) + else: + input_ids = self.zh_frontend.get_input_ids( + content, + merge_sentences=False, + get_tone_ids=get_tone_ids, + to_tensor=to_tensor) if add_sp: input_ids["phone_ids"][-1] = paddle.concat( [input_ids["phone_ids"][-1], self.sp_id_tensor]) diff --git a/paddlespeech/t2s/ssml/xml_processor.py b/paddlespeech/t2s/ssml/xml_processor.py index b39121347..892ca371e 100644 --- a/paddlespeech/t2s/ssml/xml_processor.py +++ b/paddlespeech/t2s/ssml/xml_processor.py @@ -74,6 +74,28 @@ class MixTextProcessor(): ctlist.append([mixstr, []]) return ctlist + @classmethod + def get_dom_split(self, mixstr): + ''' 文本分解,顺序加了列表中,返回文本和say-as标签 + ''' + ctlist = [] + patn = re.compile(r'(.*\s*?)(.*?)(.*\s*)$', re.M | re.S) + mat = re.match(patn, mixstr) + if mat: + pre_xml = mat.group(1) + in_xml = mat.group(2) + after_xml = mat.group(3) + + ctlist.append(pre_xml) + dom = DomXml(in_xml) + tags = dom.get_text_and_sayas_tags() + ctlist.extend(tags) + + ctlist.append(after_xml) + return ctlist + else: + ctlist.append(mixstr) + return ctlist class DomXml(): def __init__(self, xmlstr): @@ -156,3 +178,15 @@ class DomXml(): if x.hasAttribute('pinyin'): # pinyin print(x.tagName, 'pinyin', x.getAttribute('pinyin'), x.firstChild.data) + + def get_text_and_sayas_tags(self): + '''返回 xml 内容的列表,包括所有文本内容和 tag''' + res = [] + + for x1 in self.rnode: + if x1.nodeType == Node.TEXT_NODE: + res.append(x1.value) + else: + for x2 in x1.childNodes: + res.append(x2.toxml()) + return res From 1fd38c0e8b5937a5e9a1fd576e35c610b7b181a0 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 13 Jan 2023 17:40:47 +0800 Subject: [PATCH 07/24] fix o (#2831) --- paddlespeech/t2s/frontend/g2pw/onnx_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/t2s/frontend/g2pw/onnx_api.py b/paddlespeech/t2s/frontend/g2pw/onnx_api.py index 47c26a610..3ce3d246d 100644 --- a/paddlespeech/t2s/frontend/g2pw/onnx_api.py +++ b/paddlespeech/t2s/frontend/g2pw/onnx_api.py @@ -100,7 +100,7 @@ class G2PWOnnxConverter: ] self.non_polyphonic = { '一', '不', '和', '咋', '嗲', '剖', '差', '攢', '倒', '難', '奔', '勁', '拗', - '肖', '瘙', '誒', '泊', '听' + '肖', '瘙', '誒', '泊', '听', '噢' } self.non_monophonic = {'似', '攢'} self.monophonic_chars = [ From 57b9d4bca4c897835a52a8f6a2f9ee04ddc4b402 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=89=BE=E6=A2=A6?= Date: Fri, 13 Jan 2023 20:42:23 +0800 Subject: [PATCH 08/24] add diffusion module for training diffsinger (#2832) --- docs/requirements.txt | 1 + paddlespeech/t2s/modules/diffusion.py | 467 ++++++++++++++++++++++++++ setup.py | 1 + 3 files changed, 469 insertions(+) create mode 100644 paddlespeech/t2s/modules/diffusion.py diff --git a/docs/requirements.txt b/docs/requirements.txt index bd7f40ec3..c6228d917 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -27,6 +27,7 @@ pandas pathos==0.2.8 pattern_singleton Pillow>=9.0.0 +ppdiffusers>=0.9.0 praatio==5.0.0 prettytable pypinyin-dict diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py new file mode 100644 index 000000000..52fe84ceb --- /dev/null +++ b/paddlespeech/t2s/modules/diffusion.py @@ -0,0 +1,467 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Diffusion denoising related modules for paddle""" +import math +from typing import Callable +from typing import Optional +from typing import Tuple + +import paddle +import ppdiffusers +from paddle import nn +from ppdiffusers.models.embeddings import Timesteps +from ppdiffusers.schedulers import DDPMScheduler + +from paddlespeech.t2s.modules.nets_utils import initialize +from paddlespeech.t2s.modules.residual_block import WaveNetResidualBlock + + +class WaveNetDenoiser(nn.Layer): + """A Mel-Spectrogram Denoiser modified from WaveNet + + Args: + in_channels (int, optional): + Number of channels of the input mel-spectrogram, by default 80 + out_channels (int, optional): + Number of channels of the output mel-spectrogram, by default 80 + kernel_size (int, optional): + Kernel size of the residual blocks inside, by default 3 + layers (int, optional): + Number of residual blocks inside, by default 20 + stacks (int, optional): + The number of groups to split the residual blocks into, by default 4 + Within each group, the dilation of the residual block grows exponentially. + residual_channels (int, optional): + Residual channel of the residual blocks, by default 256 + gate_channels (int, optional): + Gate channel of the residual blocks, by default 512 + skip_channels (int, optional): + Skip channel of the residual blocks, by default 256 + aux_channels (int, optional): + Auxiliary channel of the residual blocks, by default 256 + dropout (float, optional): + Dropout of the residual blocks, by default 0. 
+ bias (bool, optional): + Whether to use bias in residual blocks, by default True + use_weight_norm (bool, optional): + Whether to use weight norm in all convolutions, by default False + """ + + def __init__( + self, + in_channels: int=80, + out_channels: int=80, + kernel_size: int=3, + layers: int=20, + stacks: int=4, + residual_channels: int=256, + gate_channels: int=512, + skip_channels: int=256, + aux_channels: int=256, + dropout: float=0., + bias: bool=True, + use_weight_norm: bool=False, + init_type: str="kaiming_uniform", ): + super().__init__() + + # initialize parameters + initialize(self, init_type) + + self.in_channels = in_channels + self.out_channels = out_channels + self.aux_channels = aux_channels + self.layers = layers + self.stacks = stacks + self.kernel_size = kernel_size + + assert layers % stacks == 0 + layers_per_stack = layers // stacks + + self.first_t_emb = nn.Sequential( + Timesteps( + residual_channels, + flip_sin_to_cos=False, + downscale_freq_shift=1), + nn.Linear(residual_channels, residual_channels * 4), + nn.Mish(), nn.Linear(residual_channels * 4, residual_channels)) + self.t_emb_layers = nn.LayerList([ + nn.Linear(residual_channels, residual_channels) + for _ in range(layers) + ]) + + self.first_conv = nn.Conv1D( + in_channels, residual_channels, 1, bias_attr=True) + self.first_act = nn.ReLU() + + self.conv_layers = nn.LayerList() + for layer in range(layers): + dilation = 2**(layer % layers_per_stack) + conv = WaveNetResidualBlock( + kernel_size=kernel_size, + residual_channels=residual_channels, + gate_channels=gate_channels, + skip_channels=skip_channels, + aux_channels=aux_channels, + dilation=dilation, + dropout=dropout, + bias=bias) + self.conv_layers.append(conv) + + self.last_conv_layers = nn.Sequential(nn.ReLU(), + nn.Conv1D( + skip_channels, + skip_channels, + 1, + bias_attr=True), + nn.ReLU(), + nn.Conv1D( + skip_channels, + out_channels, + 1, + bias_attr=True)) + + if use_weight_norm: + self.apply_weight_norm() + + def forward(self, x, t, c): + """Denoise mel-spectrogram. + + Args: + x(Tensor): + Shape (N, C_in, T), The input mel-spectrogram. + t(Tensor): + Shape (N), The timestep input. + c(Tensor): + Shape (N, C_aux, T'). The auxiliary input (e.g. fastspeech2 encoder output). + + Returns: + Tensor: Shape (N, C_out, T), the denoised mel-spectrogram. + """ + assert c.shape[-1] == x.shape[-1] + + if t.shape[0] != x.shape[0]: + t = t.tile([x.shape[0]]) + t_emb = self.first_t_emb(t) + t_embs = [ + t_emb_layer(t_emb)[..., None] for t_emb_layer in self.t_emb_layers + ] + + x = self.first_conv(x) + x = self.first_act(x) + skips = 0 + for f, t in zip(self.conv_layers, t_embs): + x = x + t + x, s = f(x, c) + skips += s + skips *= math.sqrt(1.0 / len(self.conv_layers)) + + x = self.last_conv_layers(skips) + return x + + def apply_weight_norm(self): + """Recursively apply weight normalization to all the Convolution layers + in the sublayers. + """ + + def _apply_weight_norm(layer): + if isinstance(layer, (nn.Conv1D, nn.Conv2D)): + nn.utils.weight_norm(layer) + + self.apply(_apply_weight_norm) + + def remove_weight_norm(self): + """Recursively remove weight normalization from all the Convolution + layers in the sublayers. + """ + + def _remove_weight_norm(layer): + try: + nn.utils.remove_weight_norm(layer) + except ValueError: + pass + + self.apply(_remove_weight_norm) + + +class GaussianDiffusion(nn.Layer): + """Common Gaussian Diffusion Denoising Model Module + + Args: + denoiser (Layer, optional): + The model used for denoising noises. 
+ In fact, the denoiser model performs the operation + of producing a output with more noises from the noisy input. + Then we use the diffusion algorithm to calculate + the input with the output to get the denoised result. + num_train_timesteps (int, optional): + The number of timesteps between the noise and the real during training, by default 1000. + beta_start (float, optional): + beta start parameter for the scheduler, by default 0.0001. + beta_end (float, optional): + beta end parameter for the scheduler, by default 0.0001. + beta_schedule (str, optional): + beta schedule parameter for the scheduler, by default 'squaredcos_cap_v2' (cosine schedule). + num_max_timesteps (int, optional): + The max timestep transition from real to noise, by default None. + + Examples: + >>> import paddle + >>> import paddle.nn.functional as F + >>> from tqdm import tqdm + >>> + >>> denoiser = WaveNetDenoiser() + >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=1000, num_max_timesteps=100) + >>> x = paddle.ones([4, 80, 192]) # [B, mel_ch, T] # real mel input + >>> c = paddle.randn([4, 256, 192]) # [B, fs2_encoder_out_ch, T] # fastspeech2 encoder output + >>> loss = F.mse_loss(*diffusion(x, c)) + >>> loss.backward() + >>> print('MSE Loss:', loss.item()) + MSE Loss: 1.6669728755950928 + >>> def create_progress_callback(): + >>> pbar = None + >>> def callback(index, timestep, num_timesteps, sample): + >>> nonlocal pbar + >>> if pbar is None: + >>> pbar = tqdm(total=num_timesteps-index) + >>> pbar.update() + >>> + >>> return callback + >>> + >>> # ds=1000, K_step=60, scheduler=ddpm, from aux fs2 mel output + >>> ds = 1000 + >>> infer_steps = 1000 + >>> K_step = 60 + >>> scheduler_type = 'ddpm' + >>> x_in = x + >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) + >>> with paddle.no_grad(): + >>> sample = diffusion.inference( + >>> paddle.randn(x.shape), c, x, + >>> num_inference_steps=infer_steps, + >>> scheduler_type=scheduler_type, + >>> callback=create_progress_callback()) + 100%|█████| 60/60 [00:03<00:00, 18.36it/s] + >>> + >>> # ds=100, K_step=100, scheduler=ddpm, from gaussian noise + >>> ds = 100 + >>> infer_steps = 100 + >>> K_step = 100 + >>> scheduler_type = 'ddpm' + >>> x_in = None + >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) + >>> with paddle.no_grad(): + >>> sample = diffusion.inference( + >>> paddle.randn(x.shape), c, x_in, + >>> num_inference_steps=infer_steps, + >>> scheduler_type=scheduler_type, + >>> callback=create_progress_callback()) + 100%|█████| 100/100 [00:05<00:00, 18.29it/s] + >>> + >>> # ds=1000, K_step=1000, scheduler=pndm, infer_step=25, from gaussian noise + >>> ds = 1000 + >>> infer_steps = 25 + >>> K_step = 1000 + >>> scheduler_type = 'pndm' + >>> x_in = None + >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) + >>> with paddle.no_grad(): + >>> sample = diffusion.inference( + >>> paddle.randn(x.shape), c, None, + >>> num_inference_steps=infer_steps, + >>> scheduler_type=scheduler_type, + >>> callback=create_progress_callback()) + 100%|█████| 25/25 [00:01<00:00, 19.75it/s] + >>> + >>> # ds=1000, K_step=100, scheduler=pndm, infer_step=50, from aux fs2 mel output + >>> ds = 1000 + >>> infer_steps = 50 + >>> K_step = 100 + >>> scheduler_type = 'pndm' + >>> x_in = x + >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) + >>> with paddle.no_grad(): + >>> sample = diffusion.inference( + >>> 
paddle.randn(x.shape), c, x, + >>> num_inference_steps=infer_steps, + >>> scheduler_type=scheduler_type, + >>> callback=create_progress_callback()) + 100%|█████| 5/5 [00:00<00:00, 23.80it/s] + + """ + + def __init__(self, + denoiser: nn.Layer, + num_train_timesteps: Optional[int]=1000, + beta_start: Optional[float]=0.0001, + beta_end: Optional[float]=0.02, + beta_schedule: Optional[str]="squaredcos_cap_v2", + num_max_timesteps: Optional[int]=None): + super().__init__() + + self.num_train_timesteps = num_train_timesteps + self.beta_start = beta_start + self.beta_end = beta_end + self.beta_schedule = beta_schedule + + self.denoiser = denoiser + self.noise_scheduler = DDPMScheduler( + num_train_timesteps=num_train_timesteps, + beta_start=beta_start, + beta_end=beta_end, + beta_schedule=beta_schedule) + self.num_max_timesteps = num_max_timesteps + + def forward(self, x: paddle.Tensor, cond: Optional[paddle.Tensor]=None + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Generate random timesteps noised x. + + Args: + x (Tensor): + The input for adding noises. + cond (Tensor, optional): + Conditional input for compute noises. + + Returns: + y (Tensor): + The output with noises added in. + target (Tensor): + The noises which is added to the input. + + """ + noise_scheduler = self.noise_scheduler + + # Sample noise that we'll add to the mel-spectrograms + target = noise = paddle.randn(x.shape) + + # Sample a random timestep for each mel-spectrogram + num_timesteps = self.num_train_timesteps + if self.num_max_timesteps is not None: + num_timesteps = self.num_max_timesteps + timesteps = paddle.randint(0, num_timesteps, (x.shape[0], )) + + # Add noise to the clean mel-spectrograms according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_images = noise_scheduler.add_noise(x, noise, timesteps) + + y = self.denoiser(noisy_images, timesteps, cond) + + # then compute loss use output y and noisy target for prediction_type == "epsilon" + return y, target + + def inference(self, + noise: paddle.Tensor, + cond: Optional[paddle.Tensor]=None, + ref_x: Optional[paddle.Tensor]=None, + num_inference_steps: Optional[int]=1000, + strength: Optional[float]=None, + scheduler_type: Optional[str]="ddpm", + callback: Optional[Callable[[int, int, int, paddle.Tensor], + None]]=None, + callback_steps: Optional[int]=1): + """Denoising input from noises. Refer to ppdiffusers img2img pipeline. + + Args: + noise (Tensor): + The input tensor as a starting point for denoising. + cond (Tensor, optional): + Conditional input for compute noises. + ref_x (Tensor, optional): + The real output for the denoising process to refer. + num_inference_steps (int, optional): + The number of timesteps between the noise and the real during inference, by default 1000. + strength (float, optional): + Mixing strength of ref_x with noise. The larger the value, the stronger the noise. + Range [0,1], by default None. + scheduler_type (str, optional): + Noise scheduler for generate noises. + Choose a great scheduler can skip many denoising step, by default 'ddpm'. + callback (Callable[[int,int,int,Tensor], None], optional): + Callback function during denoising steps. + + Args: + index (int): + Current denoising index. + timestep (int): + Current denoising timestep. + num_timesteps (int): + Number of the denoising timesteps. + denoised_output (Tensor): + Current intermediate result produced during denoising. + + callback_steps (int, optional): + The step to call the callback function. 
+ + Returns: + denoised_output (Tensor): + The denoised output tensor. + + """ + scheduler_cls = None + for clsname in dir(ppdiffusers.schedulers): + if clsname.lower() == scheduler_type + "scheduler": + scheduler_cls = getattr(ppdiffusers.schedulers, clsname) + break + + if scheduler_cls is None: + raise ValueError(f"No such scheduler type named {scheduler_type}") + + scheduler = scheduler_cls( + num_train_timesteps=self.num_train_timesteps, + beta_start=self.beta_start, + beta_end=self.beta_end, + beta_schedule=self.beta_schedule) + + # set timesteps + scheduler.set_timesteps(num_inference_steps) + + # prepare first noise variables + noisy_input = noise + timesteps = scheduler.timesteps + if ref_x is not None: + init_timestep = None + if strength is None or strength < 0. or strength > 1.: + strength = None + if self.num_max_timesteps is not None: + strength = self.num_max_timesteps / self.num_train_timesteps + if strength is not None: + # get the original timestep using init_timestep + init_timestep = min( + int(num_inference_steps * strength), num_inference_steps) + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = scheduler.timesteps[t_start:] + num_inference_steps = num_inference_steps - t_start + noisy_input = scheduler.add_noise( + ref_x, noise, timesteps[:1].tile([noise.shape[0]])) + + # denoising loop + denoised_output = noisy_input + num_warmup_steps = len( + timesteps) - num_inference_steps * scheduler.order + for i, t in enumerate(timesteps): + denoised_output = scheduler.scale_model_input(denoised_output, t) + + # predict the noise residual + noise_pred = self.denoiser(denoised_output, t, cond) + + # compute the previous noisy sample x_t -> x_t-1 + denoised_output = scheduler.step(noise_pred, t, + denoised_output).prev_sample + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and + (i + 1) % scheduler.order == 0): + if callback is not None and i % callback_steps == 0: + callback(i, t, len(timesteps), denoised_output) + + return denoised_output diff --git a/setup.py b/setup.py index 3bde2b205..212d3b109 100644 --- a/setup.py +++ b/setup.py @@ -49,6 +49,7 @@ base = [ "opencc-python-reimplemented", "pandas", "paddlenlp>=2.4.8", + "ppdiffusers>=0.9.0", "paddlespeech_feat", "Pillow>=9.0.0", "praatio==5.0.0", From 2f3ca4ac4809767008f89b0ab24846b2f5e0b983 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 17 Jan 2023 13:55:18 +0800 Subject: [PATCH 09/24] Update README.md (#2840) * Update README.md * Update README_cn.md --- README.md | 4 ++-- README_cn.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2fb773634..40064f5d2 100644 --- a/README.md +++ b/README.md @@ -157,8 +157,8 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - 🧩 *Cascaded models application*: as an extension of the typical traditional audio tasks, we combine the workflows of the aforementioned tasks with other fields like Natural language processing (NLP) and Computer Vision (CV). ### Recent Update -- 🔥 2022.01.10: Add [code-switch asr CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). -- 👑 2022.01.06: Add [code-switch asr tal_cs recipe](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). +- 🔥 2023.01.10: Add [code-switch asr CLI and Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). 
+- 👑 2023.01.06: Add [code-switch asr tal_cs recipe](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). - 🎉 2022.12.02: Add [end-to-end Prosody Prediction pipeline](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (including using prosody labels in Acoustic Model). - 🎉 2022.11.30: Add [TTS Android Demo](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid). - 🤗 2022.11.28: PP-TTS and PP-ASR demos are available in [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) and [official website diff --git a/README_cn.md b/README_cn.md index 53f6a66e4..d2e5f63d7 100644 --- a/README_cn.md +++ b/README_cn.md @@ -164,8 +164,8 @@ ### 近期更新 -- 🔥 2022.01.10: 新增 [中英混合 ASR CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). -- 👑 2022.01.06: 新增 [ASR中英混合 tal_cs 训练推理流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). +- 🔥 2023.01.10: 新增 [中英混合 ASR CLI 和 Demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_recognition). +- 👑 2023.01.06: 新增 [ASR中英混合 tal_cs 训练推理流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/tal_cs/asr1/). - 🎉 2022.12.02: 新增 [端到端韵律预测全流程](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/tts3_rhy) (包含在声学模型中使用韵律标签)。 - 🎉 2022.11.30: 新增 [TTS Android 部署示例](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/TTSAndroid)。 - 🤗 2022.11.28: PP-TTS and PP-ASR 示例可在 [AIStudio](https://aistudio.baidu.com/aistudio/modelsoverview) 和[飞桨官网](https://www.paddlepaddle.org.cn/models)体验! From 478fd2593e215a65ec641bc1ba831e53d9da7d4b Mon Sep 17 00:00:00 2001 From: Ming Date: Tue, 17 Jan 2023 17:40:15 +0800 Subject: [PATCH 10/24] update QR Code in README, test=doc (#2841) --- README.md | 2 +- README_cn.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 40064f5d2..afc4e4d09 100644 --- a/README.md +++ b/README.md @@ -191,7 +191,7 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision - Scan the QR code below with your Wechat, you can access to official technical exchange group and get the bonus ( more than 20GB learning materials, such as papers, codes and videos ) and the live link of the lessons. Look forward to your participation.
- +
## Installation diff --git a/README_cn.md b/README_cn.md index d2e5f63d7..ecc4644aa 100644 --- a/README_cn.md +++ b/README_cn.md @@ -202,7 +202,7 @@ 微信扫描二维码关注公众号,点击“马上报名”填写问卷加入官方交流群,获得更高效的问题答疑,与各行各业开发者充分交流,期待您的加入。
- +
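To make the diffusion module added above concrete (the same patch adds the `ppdiffusers>=0.9.0` dependency to setup.py), the snippet below sketches how its `forward` (training) and `inference` (sampling) entry points fit together. This is a minimal sketch, not code from the patch: the import path and class name `GaussianDiffusion`, the toy denoiser, and the tensor shapes are assumptions, since the class definition itself sits above the excerpt quoted here.

```python
import paddle
import paddle.nn as nn

# Assumed import path and class name; the class definition is not part of
# the excerpt quoted above.
from paddlespeech.t2s.modules.diffusion import GaussianDiffusion


class ToyDenoiser(nn.Layer):
    """Stand-in denoiser; real models also use the timestep and condition."""

    def __init__(self, n_mels: int=80):
        super().__init__()
        self.proj = nn.Linear(n_mels, n_mels)

    def forward(self, x, timesteps, cond=None):
        return self.proj(x)


diffusion = GaussianDiffusion(ToyDenoiser(), num_train_timesteps=1000)

# Training: forward() noises the input at random timesteps and returns the
# denoiser's prediction together with the noise that was added (the target).
mel = paddle.randn([4, 100, 80])  # (batch, frames, n_mels); shapes are illustrative
noise_pred, target = diffusion(mel)
loss = nn.functional.mse_loss(noise_pred, target)

# Sampling: start from pure noise; "ddim" is resolved to ppdiffusers'
# DDIMScheduler by the name lookup inside inference().
denoised = diffusion.inference(
    paddle.randn(mel.shape),
    num_inference_steps=25,
    scheduler_type="ddim")
```

When `ref_x` and `strength` are supplied, `inference` behaves like the ppdiffusers img2img pipeline: `ref_x` is noised to the level of the first remaining timestep and only the final `strength` fraction of the schedule is denoised.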
From 140aed4b545885cdb9a13117e9d1a009466c44ac Mon Sep 17 00:00:00 2001 From: HuangLiangJie Date: Thu, 19 Jan 2023 16:04:03 +0800 Subject: [PATCH 11/24] [TTS]VITS init sampler reverse, test=tts (#2843) --- paddlespeech/t2s/exps/vits/normalize.py | 2 +- paddlespeech/t2s/exps/vits/preprocess.py | 2 +- paddlespeech/t2s/exps/vits/train.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlespeech/t2s/exps/vits/normalize.py b/paddlespeech/t2s/exps/vits/normalize.py index 514cbef8e..24e15765e 100644 --- a/paddlespeech/t2s/exps/vits/normalize.py +++ b/paddlespeech/t2s/exps/vits/normalize.py @@ -187,7 +187,7 @@ def main(): record["spk_emb"] = str(item["spk_emb"]) output_metadata.append(record) - output_metadata.sort(key=itemgetter('feats_lengths')) + output_metadata.sort(key=itemgetter('feats_lengths'), reverse=True) output_metadata_path = Path(args.dumpdir) / "metadata.jsonl" with jsonlines.open(output_metadata_path, 'w') as writer: for item in output_metadata: diff --git a/paddlespeech/t2s/exps/vits/preprocess.py b/paddlespeech/t2s/exps/vits/preprocess.py index 2b1a40834..d6b226a20 100644 --- a/paddlespeech/t2s/exps/vits/preprocess.py +++ b/paddlespeech/t2s/exps/vits/preprocess.py @@ -166,7 +166,7 @@ def process_sentences(config, if record: results.append(record) - results.sort(key=itemgetter("feats_lengths")) + results.sort(key=itemgetter("feats_lengths"), reverse=True) with jsonlines.open(output_dir / "metadata.jsonl", 'w') as writer: for item in results: writer.write(item) diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index 07301db56..f6a31ced2 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -110,7 +110,7 @@ def train_sp(args, config): train_sampler = ErnieSATSampler( train_dataset, batch_size=config.batch_size, - shuffle=True, + shuffle=False, drop_last=True) dev_sampler = ErnieSATSampler( dev_dataset, From 2b01e4052559b5c0e1a7d47f4eb1e340a5a1bf1d Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 30 Jan 2023 13:33:38 +0800 Subject: [PATCH 12/24] =?UTF-8?q?[TTS]soft=20link=20for=20shell=20in=20exa?= =?UTF-8?q?mple,=20add=20skip=5Fcopy=5Fwave=20in=20norm=20stage=20of=20G?= =?UTF-8?q?=E2=80=A6=20(#2851)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit soft link for shell in example, add skip_copy_wave in norm stage of GANVocoders to save disk --- examples/aishell3/tts3/path.sh | 14 +--- examples/aishell3/vc0/path.sh | 14 +--- examples/aishell3/vc1/local/train.sh | 14 +--- examples/aishell3/vc1/path.sh | 14 +--- examples/aishell3/vc2/local/synthesize.sh | 21 +----- examples/aishell3/vc2/local/train.sh | 14 +--- examples/aishell3/vc2/path.sh | 14 +--- examples/aishell3/voc1/local/preprocess.sh | 10 ++- examples/aishell3/voc1/local/synthesize.sh | 15 +---- examples/aishell3/voc1/local/train.sh | 14 +--- examples/aishell3/voc1/path.sh | 14 +--- examples/aishell3/voc5/local/preprocess.sh | 56 +--------------- examples/aishell3/voc5/local/synthesize.sh | 15 +---- examples/aishell3/voc5/local/train.sh | 14 +--- examples/aishell3/voc5/path.sh | 14 +--- .../ernie_sat/local/synthesize.sh | 26 +------- .../aishell3_vctk/ernie_sat/local/train.sh | 13 +--- examples/aishell3_vctk/ernie_sat/path.sh | 14 +--- examples/csmsc/voc1/local/preprocess.sh | 10 ++- examples/csmsc/voc3/finetune.sh | 65 +------------------ examples/csmsc/voc3/local/preprocess.sh | 56 +--------------- examples/csmsc/voc3/local/train.sh | 14 +--- examples/csmsc/voc4/local/preprocess.sh | 
56 +--------------- examples/csmsc/voc4/local/train.sh | 14 +--- examples/csmsc/voc5/finetune.sh | 9 ++- examples/csmsc/voc5/local/preprocess.sh | 56 +--------------- examples/csmsc/voc5/local/train.sh | 14 +--- examples/csmsc/voc6/local/preprocess.sh | 10 ++- examples/csmsc/voc6/local/train.sh | 14 +--- examples/ljspeech/tts0/local/train.sh | 13 +--- examples/ljspeech/tts0/path.sh | 14 +--- examples/ljspeech/tts3/local/train.sh | 13 +--- examples/ljspeech/tts3/path.sh | 14 +--- examples/ljspeech/voc1/local/preprocess.sh | 10 ++- examples/ljspeech/voc1/local/synthesize.sh | 15 +---- examples/ljspeech/voc1/local/train.sh | 14 +--- examples/ljspeech/voc1/path.sh | 14 +--- examples/ljspeech/voc5/local/preprocess.sh | 56 +--------------- examples/ljspeech/voc5/local/synthesize.sh | 15 +---- examples/ljspeech/voc5/local/train.sh | 14 +--- examples/ljspeech/voc5/path.sh | 14 +--- examples/vctk/ernie_sat/local/train.sh | 13 +--- examples/vctk/ernie_sat/path.sh | 14 +--- examples/vctk/tts3/local/train.sh | 14 +--- examples/vctk/tts3/path.sh | 14 +--- examples/vctk/voc1/local/preprocess.sh | 10 ++- examples/vctk/voc1/local/synthesize.sh | 15 +---- examples/vctk/voc1/local/train.sh | 14 +--- examples/vctk/voc1/path.sh | 14 +--- examples/vctk/voc5/local/preprocess.sh | 56 +--------------- examples/vctk/voc5/local/synthesize.sh | 15 +---- examples/vctk/voc5/local/train.sh | 14 +--- examples/vctk/voc5/path.sh | 14 +--- examples/zh_en_tts/tts3/local/train.sh | 14 +--- examples/zh_en_tts/tts3/path.sh | 14 +--- 55 files changed, 90 insertions(+), 979 deletions(-) mode change 100755 => 120000 examples/aishell3/tts3/path.sh mode change 100755 => 120000 examples/aishell3/vc0/path.sh mode change 100755 => 120000 examples/aishell3/vc1/local/train.sh mode change 100755 => 120000 examples/aishell3/vc1/path.sh mode change 100755 => 120000 examples/aishell3/vc2/local/synthesize.sh mode change 100755 => 120000 examples/aishell3/vc2/local/train.sh mode change 100755 => 120000 examples/aishell3/vc2/path.sh mode change 100755 => 120000 examples/aishell3/voc1/local/synthesize.sh mode change 100755 => 120000 examples/aishell3/voc1/local/train.sh mode change 100755 => 120000 examples/aishell3/voc1/path.sh mode change 100755 => 120000 examples/aishell3/voc5/local/preprocess.sh mode change 100755 => 120000 examples/aishell3/voc5/local/synthesize.sh mode change 100755 => 120000 examples/aishell3/voc5/local/train.sh mode change 100755 => 120000 examples/aishell3/voc5/path.sh mode change 100755 => 120000 examples/aishell3_vctk/ernie_sat/local/synthesize.sh mode change 100755 => 120000 examples/aishell3_vctk/ernie_sat/local/train.sh mode change 100755 => 120000 examples/aishell3_vctk/ernie_sat/path.sh mode change 100755 => 120000 examples/csmsc/voc3/finetune.sh mode change 100755 => 120000 examples/csmsc/voc3/local/preprocess.sh mode change 100755 => 120000 examples/csmsc/voc3/local/train.sh mode change 100755 => 120000 examples/csmsc/voc4/local/preprocess.sh mode change 100755 => 120000 examples/csmsc/voc4/local/train.sh mode change 100755 => 120000 examples/csmsc/voc5/local/preprocess.sh mode change 100755 => 120000 examples/csmsc/voc5/local/train.sh mode change 100755 => 120000 examples/csmsc/voc6/local/train.sh mode change 100755 => 120000 examples/ljspeech/tts0/local/train.sh mode change 100755 => 120000 examples/ljspeech/tts0/path.sh mode change 100755 => 120000 examples/ljspeech/tts3/local/train.sh mode change 100755 => 120000 examples/ljspeech/tts3/path.sh mode change 100755 => 120000 
examples/ljspeech/voc1/local/synthesize.sh mode change 100755 => 120000 examples/ljspeech/voc1/local/train.sh mode change 100755 => 120000 examples/ljspeech/voc1/path.sh mode change 100755 => 120000 examples/ljspeech/voc5/local/preprocess.sh mode change 100755 => 120000 examples/ljspeech/voc5/local/synthesize.sh mode change 100755 => 120000 examples/ljspeech/voc5/local/train.sh mode change 100755 => 120000 examples/ljspeech/voc5/path.sh mode change 100755 => 120000 examples/vctk/ernie_sat/local/train.sh mode change 100755 => 120000 examples/vctk/ernie_sat/path.sh mode change 100755 => 120000 examples/vctk/tts3/local/train.sh mode change 100755 => 120000 examples/vctk/tts3/path.sh mode change 100755 => 120000 examples/vctk/voc1/local/synthesize.sh mode change 100755 => 120000 examples/vctk/voc1/local/train.sh mode change 100755 => 120000 examples/vctk/voc1/path.sh mode change 100755 => 120000 examples/vctk/voc5/local/preprocess.sh mode change 100755 => 120000 examples/vctk/voc5/local/synthesize.sh mode change 100755 => 120000 examples/vctk/voc5/local/train.sh mode change 100755 => 120000 examples/vctk/voc5/path.sh mode change 100755 => 120000 examples/zh_en_tts/tts3/local/train.sh mode change 100755 => 120000 examples/zh_en_tts/tts3/path.sh diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/tts3/path.sh b/examples/aishell3/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh deleted file mode 100755 index a37cd21e3..000000000 --- a/examples/aishell3/vc0/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=tacotron2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc0/path.sh b/examples/aishell3/vc0/path.sh new file mode 120000 index 000000000..9e1fdbd16 --- /dev/null +++ b/examples/aishell3/vc0/path.sh @@ -0,0 +1 @@ +../../csmsc/tts0/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc1/local/train.sh b/examples/aishell3/vc1/local/train.sh deleted file mode 100755 index c775fcadc..000000000 --- a/examples/aishell3/vc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --voice-cloning=True \ No newline at end of file diff --git a/examples/aishell3/vc1/local/train.sh 
b/examples/aishell3/vc1/local/train.sh new file mode 120000 index 000000000..115a0b8dc --- /dev/null +++ b/examples/aishell3/vc1/local/train.sh @@ -0,0 +1 @@ +../../vc0/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/vc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc1/path.sh b/examples/aishell3/vc1/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/vc1/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh deleted file mode 100755 index 8fd8977d3..000000000 --- a/examples/aishell3/vc2/local/synthesize.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -python3 ${BIN_DIR}/../synthesize.py \ - --am=fastspeech2_aishell3 \ - --am_config=${config_path} \ - --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --am_stat=dump/train/speech_stats.npy \ - --voc=pwgan_aishell3 \ - --voc_config=pwg_aishell3_ckpt_0.5/default.yaml \ - --voc_ckpt=pwg_aishell3_ckpt_0.5/snapshot_iter_1000000.pdz \ - --voc_stat=pwg_aishell3_ckpt_0.5/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt \ - --speaker_dict=dump/speaker_id_map.txt \ - --voice-cloning=True diff --git a/examples/aishell3/vc2/local/synthesize.sh b/examples/aishell3/vc2/local/synthesize.sh new file mode 120000 index 000000000..ca8df6b04 --- /dev/null +++ b/examples/aishell3/vc2/local/synthesize.sh @@ -0,0 +1 @@ +../../vc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/local/train.sh b/examples/aishell3/vc2/local/train.sh deleted file mode 100755 index c775fcadc..000000000 --- a/examples/aishell3/vc2/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --voice-cloning=True \ No newline at end of file diff --git a/examples/aishell3/vc2/local/train.sh b/examples/aishell3/vc2/local/train.sh new file mode 120000 index 000000000..115a0b8dc --- /dev/null +++ b/examples/aishell3/vc2/local/train.sh @@ -0,0 +1 @@ +../../vc0/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/vc2/path.sh b/examples/aishell3/vc2/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/aishell3/vc2/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - 
-MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/aishell3/vc2/path.sh b/examples/aishell3/vc2/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/aishell3/vc2/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/local/preprocess.sh b/examples/aishell3/voc1/local/preprocess.sh index 44cc3dbe4..71eab68ad 100755 --- a/examples/aishell3/voc1/local/preprocess.sh +++ b/examples/aishell3/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/aishell3/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/aishell3/voc1/local/synthesize.sh b/examples/aishell3/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/aishell3/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/aishell3/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/aishell3/voc1/local/train.sh b/examples/aishell3/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/aishell3/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/aishell3/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export 
BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/aishell3/voc1/path.sh b/examples/aishell3/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/aishell3/voc1/path.sh @@ -0,0 +1 @@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh deleted file mode 100755 index 44cc3dbe4..000000000 --- a/examples/aishell3/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./aishell3_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/data_aishell3/ \ - --dataset=aishell3 \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/aishell3/voc5/local/preprocess.sh b/examples/aishell3/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/aishell3/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/aishell3/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git a/examples/aishell3/voc5/local/synthesize.sh b/examples/aishell3/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/aishell3/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/aishell3/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - 
-config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/aishell3/voc5/local/train.sh b/examples/aishell3/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/aishell3/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/aishell3/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/aishell3/voc5/path.sh b/examples/aishell3/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/aishell3/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh deleted file mode 100755 index 8b4178f13..000000000 --- a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh +++ /dev/null @@ -1,25 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -stage=0 -stop_stage=0 - -# hifigan -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - FLAGS_allocator_strategy=naive_best_fit \ - FLAGS_fraction_of_gpu_memory_to_use=0.01 \ - python3 ${BIN_DIR}/synthesize.py \ - --erniesat_config=${config_path} \ - --erniesat_ckpt=${train_output_path}/checkpoints/${ckpt_name} \ - --erniesat_stat=dump/train/speech_stats.npy \ - --voc=hifigan_aishell3 \ - --voc_config=hifigan_aishell3_ckpt_0.2.0/default.yaml \ - --voc_ckpt=hifigan_aishell3_ckpt_0.2.0/snapshot_iter_2500000.pdz \ - --voc_stat=hifigan_aishell3_ckpt_0.2.0/feats_stats.npy \ - --test_metadata=dump/test/norm/metadata.jsonl \ - --output_dir=${train_output_path}/test \ - --phones_dict=dump/phone_id_map.txt -fi diff --git a/examples/aishell3_vctk/ernie_sat/local/synthesize.sh b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh new file mode 120000 index 000000000..5703dcb2c --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/local/synthesize.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/synthesize.sh \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh deleted file mode 100755 index 526aac435..000000000 --- a/examples/aishell3_vctk/ernie_sat/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=8 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/local/train.sh b/examples/aishell3_vctk/ernie_sat/local/train.sh new file mode 120000 index 
000000000..9f1d2346d --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/train.sh \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/path.sh b/examples/aishell3_vctk/ernie_sat/path.sh deleted file mode 100755 index 4ecab0251..000000000 --- a/examples/aishell3_vctk/ernie_sat/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=ernie_sat -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} \ No newline at end of file diff --git a/examples/aishell3_vctk/ernie_sat/path.sh b/examples/aishell3_vctk/ernie_sat/path.sh new file mode 120000 index 000000000..5ec397590 --- /dev/null +++ b/examples/aishell3_vctk/ernie_sat/path.sh @@ -0,0 +1 @@ +../../aishell3/ernie_sat/path.sh \ No newline at end of file diff --git a/examples/csmsc/voc1/local/preprocess.sh b/examples/csmsc/voc1/local/preprocess.sh index 61d6d62be..62d0717b9 100755 --- a/examples/csmsc/voc1/local/preprocess.sh +++ b/examples/csmsc/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh deleted file mode 100755 index 6719bd0be..000000000 --- a/examples/csmsc/voc3/finetune.sh +++ /dev/null @@ -1,64 +0,0 @@ -#!/bin/bash - -source path.sh - -gpus=0 -stage=0 -stop_stage=100 - -source ${MAIN_ROOT}/utils/parse_options.sh || exit 1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - python3 ${MAIN_ROOT}/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py \ - --fastspeech2-config=fastspeech2_nosil_baker_ckpt_0.4/default.yaml \ - --fastspeech2-checkpoint=fastspeech2_nosil_baker_ckpt_0.4/snapshot_iter_76000.pdz \ - --fastspeech2-stat=fastspeech2_nosil_baker_ckpt_0.4/speech_stats.npy \ - --dur-file=durations.txt \ - --output-dir=dump_finetune \ - --phones-dict=fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt \ - --dataset=baker \ - --rootdir=~/datasets/BZNSYP/ -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - python3 ${MAIN_ROOT}/utils/link_wav.py \ - --old-dump-dir=dump \ - --dump-dir=dump_finetune -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - cp dump/train/feats_stats.npy dump_finetune/train/ -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/train/raw/metadata.jsonl \ - --dumpdir=dump_finetune/train/norm \ - --stats=dump_finetune/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/dev/raw/metadata.jsonl \ - --dumpdir=dump_finetune/dev/norm \ - --stats=dump_finetune/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump_finetune/test/raw/metadata.jsonl \ - --dumpdir=dump_finetune/test/norm \ - --stats=dump_finetune/train/feats_stats.npy -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - CUDA_VISIBLE_DEVICES=${gpus} \ - FLAGS_cudnn_exhaustive_search=true \ - FLAGS_conv_workspace_size_limit=4000 \ - python ${BIN_DIR}/train.py \ - --train-metadata=dump_finetune/train/norm/metadata.jsonl \ - --dev-metadata=dump_finetune/dev/norm/metadata.jsonl \ - --config=conf/finetune.yaml \ - --output-dir=exp/finetune \ - --ngpu=1 -fi \ No newline at end of file diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh new file mode 120000 index 000000000..b6fa868e2 --- /dev/null +++ b/examples/csmsc/voc3/finetune.sh @@ -0,0 +1 @@ +../voc5/finetune.sh \ No newline at end of file diff --git a/examples/csmsc/voc3/local/preprocess.sh b/examples/csmsc/voc3/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc3/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc3/local/preprocess.sh b/examples/csmsc/voc3/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc3/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc3/local/train.sh b/examples/csmsc/voc3/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc3/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc4/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc4/local/preprocess.sh b/examples/csmsc/voc4/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc4/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc4/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc4/local/train.sh b/examples/csmsc/voc4/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc4/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc5/finetune.sh b/examples/csmsc/voc5/finetune.sh index 6719bd0be..eb8325aeb 100755 --- a/examples/csmsc/voc5/finetune.sh +++ b/examples/csmsc/voc5/finetune.sh @@ -39,16 +39,19 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/train/raw/metadata.jsonl \ --dumpdir=dump_finetune/train/norm \ - --stats=dump_finetune/train/feats_stats.npy + --stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/dev/raw/metadata.jsonl \ --dumpdir=dump_finetune/dev/norm \ - --stats=dump_finetune/train/feats_stats.npy + --stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump_finetune/test/raw/metadata.jsonl \ --dumpdir=dump_finetune/test/norm \ - --stats=dump_finetune/train/feats_stats.npy + --stats=dump_finetune/train/feats_stats.npy \ + --skip-wav-copy fi if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then diff --git a/examples/csmsc/voc5/local/preprocess.sh b/examples/csmsc/voc5/local/preprocess.sh deleted file mode 100755 index 61d6d62be..000000000 --- a/examples/csmsc/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./baker_alignment_tone \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." 
- python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/BZNSYP/ \ - --dataset=baker \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/csmsc/voc5/local/preprocess.sh b/examples/csmsc/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/csmsc/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/csmsc/voc5/local/train.sh b/examples/csmsc/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc5/local/train.sh b/examples/csmsc/voc5/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc5/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/csmsc/voc6/local/preprocess.sh b/examples/csmsc/voc6/local/preprocess.sh index 2dcc39ac7..509824b8e 100755 --- a/examples/csmsc/voc6/local/preprocess.sh +++ b/examples/csmsc/voc6/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../gan_vocoder/normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/csmsc/voc6/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - 
--config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/csmsc/voc6/local/train.sh b/examples/csmsc/voc6/local/train.sh new file mode 120000 index 000000000..9ec3ed94b --- /dev/null +++ b/examples/csmsc/voc6/local/train.sh @@ -0,0 +1 @@ +../../voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh deleted file mode 100755 index f90db9150..000000000 --- a/examples/ljspeech/tts0/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/ljspeech/tts0/local/train.sh b/examples/ljspeech/tts0/local/train.sh new file mode 120000 index 000000000..7f54e9239 --- /dev/null +++ b/examples/ljspeech/tts0/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/tts0/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh deleted file mode 100755 index a37cd21e3..000000000 --- a/examples/ljspeech/tts0/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=tacotron2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/ljspeech/tts0/path.sh b/examples/ljspeech/tts0/path.sh new file mode 120000 index 000000000..9e1fdbd16 --- /dev/null +++ b/examples/ljspeech/tts0/path.sh @@ -0,0 +1 @@ +../../csmsc/tts0/path.sh \ No newline at end of file diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh deleted file mode 100755 index d1302f99f..000000000 --- a/examples/ljspeech/tts3/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt diff --git a/examples/ljspeech/tts3/local/train.sh b/examples/ljspeech/tts3/local/train.sh new file mode 120000 index 000000000..d7b05058e --- /dev/null +++ b/examples/ljspeech/tts3/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/tts3/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/ljspeech/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/ljspeech/tts3/path.sh b/examples/ljspeech/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/ljspeech/tts3/path.sh @@ -0,0 +1 
@@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/local/preprocess.sh b/examples/ljspeech/voc1/local/preprocess.sh index d1af60dad..bfbf75b7d 100755 --- a/examples/ljspeech/voc1/local/preprocess.sh +++ b/examples/ljspeech/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/ljspeech/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/ljspeech/voc1/local/synthesize.sh b/examples/ljspeech/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/ljspeech/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/ljspeech/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/ljspeech/voc1/local/train.sh b/examples/ljspeech/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/ljspeech/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/voc1/path.sh b/examples/ljspeech/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/ljspeech/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/ljspeech/voc1/path.sh b/examples/ljspeech/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/ljspeech/voc1/path.sh @@ -0,0 +1 
@@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh deleted file mode 100755 index d1af60dad..000000000 --- a/examples/ljspeech/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./ljspeech_alignment \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/LJSpeech-1.1/ \ - --dataset=ljspeech \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/ljspeech/voc5/local/preprocess.sh b/examples/ljspeech/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/ljspeech/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/ljspeech/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git a/examples/ljspeech/voc5/local/synthesize.sh b/examples/ljspeech/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/ljspeech/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/ljspeech/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - 
--output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/ljspeech/voc5/local/train.sh b/examples/ljspeech/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/ljspeech/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/ljspeech/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/ljspeech/voc5/path.sh b/examples/ljspeech/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/ljspeech/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/vctk/ernie_sat/local/train.sh b/examples/vctk/ernie_sat/local/train.sh deleted file mode 100755 index 526aac435..000000000 --- a/examples/vctk/ernie_sat/local/train.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=8 \ - --phones-dict=dump/phone_id_map.txt \ No newline at end of file diff --git a/examples/vctk/ernie_sat/local/train.sh b/examples/vctk/ernie_sat/local/train.sh new file mode 120000 index 000000000..9f1d2346d --- /dev/null +++ b/examples/vctk/ernie_sat/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/ernie_sat/local/train.sh \ No newline at end of file diff --git a/examples/vctk/ernie_sat/path.sh b/examples/vctk/ernie_sat/path.sh deleted file mode 100755 index 4ecab0251..000000000 --- a/examples/vctk/ernie_sat/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=ernie_sat -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} \ No newline at end of file diff --git a/examples/vctk/ernie_sat/path.sh b/examples/vctk/ernie_sat/path.sh new file mode 120000 index 000000000..5ec397590 --- /dev/null +++ b/examples/vctk/ernie_sat/path.sh @@ -0,0 +1 @@ +../../aishell3/ernie_sat/path.sh \ No newline at end of file diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh deleted file mode 100755 index 3a5076505..000000000 --- a/examples/vctk/tts3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/vctk/tts3/local/train.sh b/examples/vctk/tts3/local/train.sh new file mode 120000 index 000000000..78885a300 
--- /dev/null +++ b/examples/vctk/tts3/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/tts3/local/train.sh \ No newline at end of file diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/vctk/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/vctk/tts3/path.sh b/examples/vctk/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/vctk/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file diff --git a/examples/vctk/voc1/local/preprocess.sh b/examples/vctk/voc1/local/preprocess.sh index 88a478cd5..6b7e5288a 100755 --- a/examples/vctk/voc1/local/preprocess.sh +++ b/examples/vctk/voc1/local/preprocess.sh @@ -42,14 +42,18 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/train/raw/metadata.jsonl \ --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy + python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/dev/raw/metadata.jsonl \ --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy python3 ${BIN_DIR}/../normalize.py \ --metadata=dump/test/raw/metadata.jsonl \ --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy + --stats=dump/train/feats_stats.npy \ + --skip-wav-copy fi diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh deleted file mode 100755 index 145557b3d..000000000 --- a/examples/vctk/voc1/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=pwgan diff --git a/examples/vctk/voc1/local/synthesize.sh b/examples/vctk/voc1/local/synthesize.sh new file mode 120000 index 000000000..d6aecd8d1 --- /dev/null +++ b/examples/vctk/voc1/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/synthesize.sh \ No newline at end of file diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/vctk/voc1/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/vctk/voc1/local/train.sh b/examples/vctk/voc1/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/vctk/voc1/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of 
file diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh deleted file mode 100755 index 1e6647b86..000000000 --- a/examples/vctk/voc1/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=parallelwave_gan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} \ No newline at end of file diff --git a/examples/vctk/voc1/path.sh b/examples/vctk/voc1/path.sh new file mode 120000 index 000000000..b7ed4fb8f --- /dev/null +++ b/examples/vctk/voc1/path.sh @@ -0,0 +1 @@ +../../csmsc/voc1/path.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh deleted file mode 100755 index 88a478cd5..000000000 --- a/examples/vctk/voc5/local/preprocess.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/bin/bash - -stage=0 -stop_stage=100 - -config_path=$1 - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # get durations from MFA's result - echo "Generate durations.txt from MFA results ..." - python3 ${MAIN_ROOT}/utils/gen_duration_from_textgrid.py \ - --inputdir=./vctk_alignment \ - --output=durations.txt \ - --config=${config_path} -fi - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # extract features - echo "Extract features ..." - python3 ${BIN_DIR}/../preprocess.py \ - --rootdir=~/datasets/VCTK-Corpus-0.92/ \ - --dataset=vctk \ - --dumpdir=dump \ - --dur-file=durations.txt \ - --config=${config_path} \ - --cut-sil=True \ - --num-cpu=20 -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # get features' stats(mean and std) - echo "Get features' stats ..." - python3 ${MAIN_ROOT}/utils/compute_statistics.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --field-name="feats" -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # normalize, dev and test should use train's stats - echo "Normalize ..." 
- - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/train/raw/metadata.jsonl \ - --dumpdir=dump/train/norm \ - --stats=dump/train/feats_stats.npy - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/dev/raw/metadata.jsonl \ - --dumpdir=dump/dev/norm \ - --stats=dump/train/feats_stats.npy - - python3 ${BIN_DIR}/../normalize.py \ - --metadata=dump/test/raw/metadata.jsonl \ - --dumpdir=dump/test/norm \ - --stats=dump/train/feats_stats.npy -fi diff --git a/examples/vctk/voc5/local/preprocess.sh b/examples/vctk/voc5/local/preprocess.sh new file mode 120000 index 000000000..f0cb24de9 --- /dev/null +++ b/examples/vctk/voc5/local/preprocess.sh @@ -0,0 +1 @@ +../../voc1/local/preprocess.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh deleted file mode 100755 index 647896175..000000000 --- a/examples/vctk/voc5/local/synthesize.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 -ckpt_name=$3 - -FLAGS_allocator_strategy=naive_best_fit \ -FLAGS_fraction_of_gpu_memory_to_use=0.01 \ -python3 ${BIN_DIR}/../synthesize.py \ - --config=${config_path} \ - --checkpoint=${train_output_path}/checkpoints/${ckpt_name} \ - --test-metadata=dump/test/norm/metadata.jsonl \ - --output-dir=${train_output_path}/test \ - --generator-type=hifigan diff --git a/examples/vctk/voc5/local/synthesize.sh b/examples/vctk/voc5/local/synthesize.sh new file mode 120000 index 000000000..c887112c0 --- /dev/null +++ b/examples/vctk/voc5/local/synthesize.sh @@ -0,0 +1 @@ +../../../csmsc/voc5/local/synthesize.sh \ No newline at end of file diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh deleted file mode 100755 index 9695631ef..000000000 --- a/examples/vctk/voc5/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -FLAGS_cudnn_exhaustive_search=true \ -FLAGS_conv_workspace_size_limit=4000 \ -python ${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=1 diff --git a/examples/vctk/voc5/local/train.sh b/examples/vctk/voc5/local/train.sh new file mode 120000 index 000000000..2942893d2 --- /dev/null +++ b/examples/vctk/voc5/local/train.sh @@ -0,0 +1 @@ +../../../csmsc/voc1/local/train.sh \ No newline at end of file diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh deleted file mode 100755 index 7451b3218..000000000 --- a/examples/vctk/voc5/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=hifigan -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/gan_vocoder/${MODEL} diff --git a/examples/vctk/voc5/path.sh b/examples/vctk/voc5/path.sh new file mode 120000 index 000000000..b67fe2b39 --- /dev/null +++ b/examples/vctk/voc5/path.sh @@ -0,0 +1 @@ +../../csmsc/voc5/path.sh \ No newline at end of file diff --git a/examples/zh_en_tts/tts3/local/train.sh b/examples/zh_en_tts/tts3/local/train.sh deleted file mode 100755 index 1da72f117..000000000 --- a/examples/zh_en_tts/tts3/local/train.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -config_path=$1 -train_output_path=$2 - -python3 
${BIN_DIR}/train.py \ - --train-metadata=dump/train/norm/metadata.jsonl \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ - --config=${config_path} \ - --output-dir=${train_output_path} \ - --ngpu=2 \ - --phones-dict=dump/phone_id_map.txt \ - --speaker-dict=dump/speaker_id_map.txt diff --git a/examples/zh_en_tts/tts3/local/train.sh b/examples/zh_en_tts/tts3/local/train.sh new file mode 120000 index 000000000..78885a300 --- /dev/null +++ b/examples/zh_en_tts/tts3/local/train.sh @@ -0,0 +1 @@ +../../../aishell3/tts3/local/train.sh \ No newline at end of file diff --git a/examples/zh_en_tts/tts3/path.sh b/examples/zh_en_tts/tts3/path.sh deleted file mode 100755 index fb7e8411c..000000000 --- a/examples/zh_en_tts/tts3/path.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -export MAIN_ROOT=`realpath ${PWD}/../../../` - -export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH} -export LC_ALL=C - -export PYTHONDONTWRITEBYTECODE=1 -# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH} - -MODEL=fastspeech2 -export BIN_DIR=${MAIN_ROOT}/paddlespeech/t2s/exps/${MODEL} diff --git a/examples/zh_en_tts/tts3/path.sh b/examples/zh_en_tts/tts3/path.sh new file mode 120000 index 000000000..4785b9095 --- /dev/null +++ b/examples/zh_en_tts/tts3/path.sh @@ -0,0 +1 @@ +../../csmsc/tts3/path.sh \ No newline at end of file From 31c2c226cacf88281332e61bd03bb863b1c1e9cf Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Mon, 30 Jan 2023 19:11:02 +0800 Subject: [PATCH 13/24] clean fluid elementwise_max and square api. (#2852) --- paddlespeech/s2t/training/gradclip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py index 26ac501e2..b2c0500d3 100644 --- a/paddlespeech/s2t/training/gradclip.py +++ b/paddlespeech/s2t/training/gradclip.py @@ -43,7 +43,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): if g.type == core.VarDesc.VarType.SELECTED_ROWS: merge_grad = layers.merge_selected_rows(g) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - square = layers.square(merge_grad) + square = paddle.square(merge_grad) sum_square = layers.reduce_sum(square) sum_square_list.append(sum_square) @@ -66,7 +66,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) clip_var = layers.elementwise_div( x=max_global_norm, - y=layers.elementwise_max(x=global_norm_var, y=max_global_norm)) + y=paddle.maximum(x=global_norm_var, y=max_global_norm)) for i, (p, g) in enumerate(params_grads): if g is None: continue From b5764e9f74665babfdd922189560ba269c072635 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 30 Jan 2023 19:17:37 +0800 Subject: [PATCH 14/24] [Install]rm protobuf in setup.py (#2853) * rm protobuf in setup.py && rm audio's dependencies in setup.py --- audio/setup.py | 2 +- docs/requirements.txt | 4 +--- setup.py | 6 +----- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/audio/setup.py b/audio/setup.py index 82e9a55a5..d36b2c440 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -43,7 +43,7 @@ base = [ "scipy>=1.0.0", "soundfile~=0.10", "colorlog", - "pathos == 0.2.8", + "pathos==0.2.8", "pybind11", "parameterized", "tqdm", diff --git a/docs/requirements.txt b/docs/requirements.txt index c6228d917..5422c26f9 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,5 +1,4 @@ braceexpand -colorlog editdistance
fastapi g2p_en @@ -16,7 +15,7 @@ matplotlib myst-parser nara_wpe numpydoc -onnxruntime==1.10.0 +onnxruntime>=1.11.0 opencc paddlenlp # use paddlepaddle == 2.3.* according to: https://github.com/PaddlePaddle/Paddle/issues/48243 @@ -24,7 +23,6 @@ paddlepaddle>=2.2.2,<2.4.0 paddlespeech_ctcdecoders paddlespeech_feat pandas -pathos==0.2.8 pattern_singleton Pillow>=9.0.0 ppdiffusers>=0.9.0 diff --git a/setup.py b/setup.py index 212d3b109..be6cf63a9 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,6 @@ base = [ "paddlespeech_feat", "Pillow>=9.0.0", "praatio==5.0.0", - "protobuf>=3.1.0, <=3.20.0", "pypinyin<=0.44.0", "pypinyin-dict", "python-dateutil", @@ -72,12 +71,9 @@ base = [ "yacs~=0.1.8", "prettytable", "zhon", - "colorlog", - "pathos==0.2.8", "braceexpand", "pyyaml", - "pybind11", - "paddleslim==2.3.4", + "paddleslim>=2.3.4", "paddleaudio>=1.0.2", ] From 64aeb6dccc73a262bab9f9ed2a1b8c7b15a30582 Mon Sep 17 00:00:00 2001 From: zxcd <228587199@qq.com> Date: Tue, 31 Jan 2023 19:52:45 +0800 Subject: [PATCH 15/24] remove some fluid api (elementwise_div elementwise_mul sqrt reduce_sum). (#2859) --- paddlespeech/s2t/training/gradclip.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/paddlespeech/s2t/training/gradclip.py b/paddlespeech/s2t/training/gradclip.py index b2c0500d3..be6fcf589 100644 --- a/paddlespeech/s2t/training/gradclip.py +++ b/paddlespeech/s2t/training/gradclip.py @@ -44,7 +44,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): merge_grad = layers.merge_selected_rows(g) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) square = paddle.square(merge_grad) - sum_square = layers.reduce_sum(square) + sum_square = paddle.sum(square) sum_square_list.append(sum_square) # debug log, not dump all since slow down train process @@ -57,14 +57,15 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): return params_grads global_norm_var = layers.concat(sum_square_list) - global_norm_var = layers.reduce_sum(global_norm_var) - global_norm_var = layers.sqrt(global_norm_var) + global_norm_var = paddle.sum(global_norm_var) + global_norm_var = paddle.sqrt(global_norm_var) + # debug log logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!") max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) - clip_var = layers.elementwise_div( + clip_var = paddle.divide( x=max_global_norm, y=paddle.maximum(x=global_norm_var, y=max_global_norm)) for i, (p, g) in enumerate(params_grads): @@ -73,7 +74,7 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm): if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue - new_grad = layers.elementwise_mul(x=g, y=clip_var) + new_grad = paddle.multiply(x=g, y=clip_var) params_and_grads.append((p, new_grad)) # debug log, not dump all since slow down train process From 2f526c093cac230493f1ae399fa7182f73d588d3 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 1 Feb 2023 14:06:46 +0800 Subject: [PATCH 16/24] fix data for slim (#2862) --- examples/csmsc/tts3/local/PTQ_static.sh | 2 +- examples/csmsc/voc1/local/PTQ_static.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/csmsc/tts3/local/PTQ_static.sh b/examples/csmsc/tts3/local/PTQ_static.sh index a70a77b58..c6dce53cb 100755 --- a/examples/csmsc/tts3/local/PTQ_static.sh +++ b/examples/csmsc/tts3/local/PTQ_static.sh @@ -5,4 +5,4 @@ python3 ${BIN_DIR}/../PTQ_static.py \ --dev-metadata=dump/dev/norm/metadata.jsonl \ --inference_dir 
${train_output_path}/inference \ --model_name ${model_name} \ - --onnx_forma=True \ No newline at end of file + --onnx_format=True \ No newline at end of file diff --git a/examples/csmsc/voc1/local/PTQ_static.sh b/examples/csmsc/voc1/local/PTQ_static.sh index 2e5166141..c85ebd109 100755 --- a/examples/csmsc/voc1/local/PTQ_static.sh +++ b/examples/csmsc/voc1/local/PTQ_static.sh @@ -2,7 +2,7 @@ train_output_path=$1 model_name=$2 python3 ${BIN_DIR}/../../PTQ_static.py \ - --dev-metadata=dump/dev/norm/metadata.jsonl \ + --dev-metadata=dump/dev/raw/metadata.jsonl \ --inference_dir ${train_output_path}/inference \ --model_name ${model_name} \ --onnx_format=True \ No newline at end of file From ac3ed3c5a8a4e81ad662b8c41efa562f415dad7b Mon Sep 17 00:00:00 2001 From: QuanZ9 <31169290+QuanZ9@users.noreply.github.com> Date: Wed, 1 Feb 2023 15:55:52 +0800 Subject: [PATCH 17/24] Update zh_frontend.py (#2863) --- paddlespeech/t2s/frontend/zh_frontend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index ddd8cf5c7..efb673e36 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -138,7 +138,7 @@ class Frontend(): "拐儿", "聋儿", "乞儿", "患儿", "幼儿", "孤儿", "婴儿", "婴幼儿", "连体儿", "脑瘫儿", "流浪儿", "体弱儿", "混血儿", "蜜雪儿", "舫儿", "祖儿", "美儿", "应采儿", "可儿", "侄儿", "孙儿", "侄孙儿", "女儿", "男儿", "红孩儿", "花儿", "虫儿", "马儿", "鸟儿", "猪儿", "猫儿", - "狗儿" + "狗儿", "少儿" } self.vocab_phones = {} From 896da6dcd152b6241f606343dfa5ee6ec4932df5 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 1 Feb 2023 18:25:00 +0800 Subject: [PATCH 18/24] remove utils and third_party in paddlespeech's site-packages (#2867) --- audio/setup.py | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/audio/setup.py b/audio/setup.py index d36b2c440..6e358346c 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -273,7 +273,7 @@ def main(): }, # Package info - packages=find_packages(include=('paddleaudio*')), + packages=find_packages(include=['paddleaudio*']), package_data=lib_package_data, ext_modules=setup_helpers.get_ext_modules(), zip_safe=True, diff --git a/setup.py b/setup.py index be6cf63a9..2c97ce783 100644 --- a/setup.py +++ b/setup.py @@ -300,7 +300,7 @@ setup_info = dict( }, # Package info - packages=find_packages(include=('paddlespeech*')), + packages=find_packages(include=['paddlespeech*'], exclude=['utils', 'third_party']), zip_safe=True, classifiers=[ 'Development Status :: 5 - Production/Stable', From a55fd2e55685236c34330e0ba01e98878fc5b8cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=89=BE=E6=A2=A6?= Date: Thu, 2 Feb 2023 13:03:41 +0800 Subject: [PATCH 19/24] [TTS]Fix diffusion wavenet denoiser final conv init param (#2868) * add diffusion module for training diffsinger * add wavenet denoiser final conv initializer --- paddlespeech/t2s/modules/diffusion.py | 34 +++++++++++---------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/paddlespeech/t2s/modules/diffusion.py b/paddlespeech/t2s/modules/diffusion.py index 52fe84ceb..eb67ffb0d 100644 --- a/paddlespeech/t2s/modules/diffusion.py +++ b/paddlespeech/t2s/modules/diffusion.py @@ -40,7 +40,7 @@ class WaveNetDenoiser(nn.Layer): layers (int, optional): Number of residual blocks inside, by default 20 stacks (int, optional): - The number of groups to split the residual blocks into, by default 4 + The number of groups to split the residual blocks into, by default 5 Within each group, the dilation of the residual 
block grows exponentially. residual_channels (int, optional): Residual channel of the residual blocks, by default 256 @@ -64,7 +64,7 @@ class WaveNetDenoiser(nn.Layer): out_channels: int=80, kernel_size: int=3, layers: int=20, - stacks: int=4, + stacks: int=5, residual_channels: int=256, gate_channels: int=512, skip_channels: int=256, @@ -72,7 +72,7 @@ class WaveNetDenoiser(nn.Layer): dropout: float=0., bias: bool=True, use_weight_norm: bool=False, - init_type: str="kaiming_uniform", ): + init_type: str="kaiming_normal", ): super().__init__() # initialize parameters @@ -118,18 +118,15 @@ class WaveNetDenoiser(nn.Layer): bias=bias) self.conv_layers.append(conv) + final_conv = nn.Conv1D(skip_channels, out_channels, 1, bias_attr=True) + nn.initializer.Constant(0.0)(final_conv.weight) self.last_conv_layers = nn.Sequential(nn.ReLU(), nn.Conv1D( skip_channels, skip_channels, 1, bias_attr=True), - nn.ReLU(), - nn.Conv1D( - skip_channels, - out_channels, - 1, - bias_attr=True)) + nn.ReLU(), final_conv) if use_weight_norm: self.apply_weight_norm() @@ -200,10 +197,6 @@ class GaussianDiffusion(nn.Layer): Args: denoiser (Layer, optional): The model used for denoising noises. - In fact, the denoiser model performs the operation - of producing a output with more noises from the noisy input. - Then we use the diffusion algorithm to calculate - the input with the output to get the denoised result. num_train_timesteps (int, optional): The number of timesteps between the noise and the real during training, by default 1000. beta_start (float, optional): @@ -233,7 +226,8 @@ class GaussianDiffusion(nn.Layer): >>> def callback(index, timestep, num_timesteps, sample): >>> nonlocal pbar >>> if pbar is None: - >>> pbar = tqdm(total=num_timesteps-index) + >>> pbar = tqdm(total=num_timesteps) + >>> pbar.update(index) >>> pbar.update() >>> >>> return callback @@ -247,7 +241,7 @@ class GaussianDiffusion(nn.Layer): >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> with paddle.no_grad(): >>> sample = diffusion.inference( - >>> paddle.randn(x.shape), c, x, + >>> paddle.randn(x.shape), c, ref_x=x_in, >>> num_inference_steps=infer_steps, >>> scheduler_type=scheduler_type, >>> callback=create_progress_callback()) @@ -262,7 +256,7 @@ class GaussianDiffusion(nn.Layer): >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> with paddle.no_grad(): >>> sample = diffusion.inference( - >>> paddle.randn(x.shape), c, x_in, + >>> paddle.randn(x.shape), c, ref_x=x_in, >>> num_inference_steps=infer_steps, >>> scheduler_type=scheduler_type, >>> callback=create_progress_callback()) @@ -277,11 +271,11 @@ class GaussianDiffusion(nn.Layer): >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> with paddle.no_grad(): >>> sample = diffusion.inference( - >>> paddle.randn(x.shape), c, None, + >>> paddle.randn(x.shape), c, ref_x=x_in, >>> num_inference_steps=infer_steps, >>> scheduler_type=scheduler_type, >>> callback=create_progress_callback()) - 100%|█████| 25/25 [00:01<00:00, 19.75it/s] + 100%|█████| 34/34 [00:01<00:00, 19.75it/s] >>> >>> # ds=1000, K_step=100, scheduler=pndm, infer_step=50, from aux fs2 mel output >>> ds = 1000 @@ -292,11 +286,11 @@ class GaussianDiffusion(nn.Layer): >>> diffusion = GaussianDiffusion(denoiser, num_train_timesteps=ds, num_max_timesteps=K_step) >>> with paddle.no_grad(): >>> sample = diffusion.inference( - >>> paddle.randn(x.shape), c, x, + >>> paddle.randn(x.shape), c, 
ref_x=x_in, >>> num_inference_steps=infer_steps, >>> scheduler_type=scheduler_type, >>> callback=create_progress_callback()) - 100%|█████| 5/5 [00:00<00:00, 23.80it/s] + 100%|█████| 14/14 [00:00<00:00, 23.80it/s] """ From a283f8a57e8bbc411bd36f2e0d8df3e0780a1c0e Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 2 Feb 2023 13:04:20 +0800 Subject: [PATCH 20/24] [TTS]fix open encoding (#2865) --- paddlespeech/cli/tts/infer.py | 6 +++--- paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py | 2 +- paddlespeech/t2s/exps/ernie_sat/train.py | 2 +- paddlespeech/t2s/exps/fastspeech2/train.py | 4 ++-- paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py | 6 +++--- paddlespeech/t2s/exps/speedyspeech/train.py | 6 +++--- paddlespeech/t2s/exps/syn_utils.py | 8 ++++---- paddlespeech/t2s/exps/tacotron2/train.py | 2 +- paddlespeech/t2s/exps/transformer_tts/train.py | 2 +- paddlespeech/t2s/exps/vits/train.py | 4 ++-- paddlespeech/t2s/frontend/phonectic.py | 2 +- paddlespeech/t2s/frontend/zh_frontend.py | 4 ++-- 12 files changed, 24 insertions(+), 24 deletions(-) diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index 707518c05..5515ade26 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -292,19 +292,19 @@ class TTSExecutor(BaseExecutor): with open(self.voc_config) as f: self.voc_config = CfgNode(yaml.safe_load(f)) - with open(self.phones_dict, "r") as f: + with open(self.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) tone_size = None if self.tones_dict: - with open(self.tones_dict, "r") as f: + with open(self.tones_dict, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) spk_num = None if self.speaker_dict: - with open(self.speaker_dict, 'rt') as f: + with open(self.speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) diff --git a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py index e450aa1a0..c43dafb3c 100644 --- a/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/ernie_sat/synthesize_e2e.py @@ -437,7 +437,7 @@ if __name__ == '__main__': vocab_phones = {} - with open(args.phones_dict, 'rt') as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: vocab_phones[phn] = int(id) diff --git a/paddlespeech/t2s/exps/ernie_sat/train.py b/paddlespeech/t2s/exps/ernie_sat/train.py index 75a666bb1..c98d691be 100644 --- a/paddlespeech/t2s/exps/ernie_sat/train.py +++ b/paddlespeech/t2s/exps/ernie_sat/train.py @@ -109,7 +109,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/exps/fastspeech2/train.py b/paddlespeech/t2s/exps/fastspeech2/train.py index d31e62a82..97626db0b 100644 --- a/paddlespeech/t2s/exps/fastspeech2/train.py +++ b/paddlespeech/t2s/exps/fastspeech2/train.py @@ -67,7 +67,7 @@ def train_sp(args, config): if args.speaker_dict is not None: print("multiple speaker fastspeech2!") collate_fn = fastspeech2_multi_spk_batch_fn - with open(args.speaker_dict, 'rt') as f: + with open(args.speaker_dict, 
'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) fields += ["spk_id"] @@ -123,7 +123,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py index 644ec250d..d05dfafcf 100644 --- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py @@ -39,18 +39,18 @@ def evaluate(args, speedyspeech_config, pwg_config): # construct dataset for evaluation sentences = [] - with open(args.text, 'rt') as f: + with open(args.text, 'rt', encoding='utf-8') as f: for line in f: items = line.strip().split() utt_id = items[0] sentence = "".join(items[1:]) sentences.append((utt_id, sentence)) - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) - with open(args.tones_dict, "r") as f: + with open(args.tones_dict, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) print("tone_size:", tone_size) diff --git a/paddlespeech/t2s/exps/speedyspeech/train.py b/paddlespeech/t2s/exps/speedyspeech/train.py index 7b422e64f..c90090daa 100644 --- a/paddlespeech/t2s/exps/speedyspeech/train.py +++ b/paddlespeech/t2s/exps/speedyspeech/train.py @@ -70,7 +70,7 @@ def train_sp(args, config): if args.speaker_dict is not None: print("multiple speaker speedyspeech!") collate_fn = speedyspeech_multi_spk_batch_fn - with open(args.speaker_dict, 'rt') as f: + with open(args.speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) fields += ["spk_id"] @@ -133,11 +133,11 @@ def train_sp(args, config): collate_fn=collate_fn, num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) - with open(args.tones_dict, "r") as f: + with open(args.tones_dict, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) print("tone_size:", tone_size) diff --git a/paddlespeech/t2s/exps/syn_utils.py b/paddlespeech/t2s/exps/syn_utils.py index 6b693440c..491edda30 100644 --- a/paddlespeech/t2s/exps/syn_utils.py +++ b/paddlespeech/t2s/exps/syn_utils.py @@ -106,7 +106,7 @@ def get_chunks(data, block_size: int, pad_size: int): def get_sentences(text_file: Optional[os.PathLike], lang: str='zh'): # construct dataset for evaluation sentences = [] - with open(text_file, 'rt') as f: + with open(text_file, 'rt', encoding='utf-8') as f: for line in f: if line.strip() != "": items = re.split(r"\s+", line.strip(), 1) @@ -325,17 +325,17 @@ def get_am_inference(am: str='fastspeech2_csmsc', tones_dict: Optional[os.PathLike]=None, speaker_dict: Optional[os.PathLike]=None, return_am: bool=False): - with open(phones_dict, "r") as f: + with open(phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in 
f.readlines()] vocab_size = len(phn_id) tone_size = None if tones_dict is not None: - with open(tones_dict, "r") as f: + with open(tones_dict, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] tone_size = len(tone_id) spk_num = None if speaker_dict is not None: - with open(speaker_dict, 'rt') as f: + with open(speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) odim = am_config.n_mels diff --git a/paddlespeech/t2s/exps/tacotron2/train.py b/paddlespeech/t2s/exps/tacotron2/train.py index 69ff80e46..db88009a8 100644 --- a/paddlespeech/t2s/exps/tacotron2/train.py +++ b/paddlespeech/t2s/exps/tacotron2/train.py @@ -119,7 +119,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index da48b6b99..d49baad99 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -114,7 +114,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index f6a31ced2..0e74bf631 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -78,7 +78,7 @@ def train_sp(args, config): if args.speaker_dict is not None: print("multiple speaker vits!") collate_fn = vits_multi_spk_batch_fn - with open(args.speaker_dict, 'rt') as f: + with open(args.speaker_dict, 'rt', encoding='utf-8') as f: spk_id = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id) fields += ["spk_id"] @@ -132,7 +132,7 @@ def train_sp(args, config): num_workers=config.num_workers) print("dataloaders done!") - with open(args.phones_dict, "r") as f: + with open(args.phones_dict, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] vocab_size = len(phn_id) print("vocab_size:", vocab_size) diff --git a/paddlespeech/t2s/frontend/phonectic.py b/paddlespeech/t2s/frontend/phonectic.py index 261db80a8..af86d9b80 100644 --- a/paddlespeech/t2s/frontend/phonectic.py +++ b/paddlespeech/t2s/frontend/phonectic.py @@ -58,7 +58,7 @@ class English(Phonetics): self.punc = ":,;。?!“”‘’':,;.?!" 
self.text_normalizer = TextNormalizer() if phone_vocab_path: - with open(phone_vocab_path, 'rt') as f: + with open(phone_vocab_path, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: self.vocab_phones[phn] = int(id) diff --git a/paddlespeech/t2s/frontend/zh_frontend.py b/paddlespeech/t2s/frontend/zh_frontend.py index efb673e36..35b97a93a 100644 --- a/paddlespeech/t2s/frontend/zh_frontend.py +++ b/paddlespeech/t2s/frontend/zh_frontend.py @@ -144,12 +144,12 @@ class Frontend(): self.vocab_phones = {} self.vocab_tones = {} if phone_vocab_path: - with open(phone_vocab_path, 'rt') as f: + with open(phone_vocab_path, 'rt', encoding='utf-8') as f: phn_id = [line.strip().split() for line in f.readlines()] for phn, id in phn_id: self.vocab_phones[phn] = int(id) if tone_vocab_path: - with open(tone_vocab_path, 'rt') as f: + with open(tone_vocab_path, 'rt', encoding='utf-8') as f: tone_id = [line.strip().split() for line in f.readlines()] for tone, id in tone_id: self.vocab_tones[tone] = int(id) From c764710aa12a2f0db23475b15e1f6cafd5f05e57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=AB=A0=E5=AE=8F=E5=BD=AC?= <57510731+hopingZ@users.noreply.github.com> Date: Thu, 2 Feb 2023 13:05:35 +0800 Subject: [PATCH 21/24] [TTS]Avoid using variable "attn_loss" before assignment (#2860) * Avoid using variable "attn_loss" before assignment * Update tacotron2_updater.py --------- Co-authored-by: TianYuan --- .../t2s/models/tacotron2/tacotron2_updater.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py index 09e6827d0..1db9248ae 100644 --- a/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py +++ b/paddlespeech/t2s/models/tacotron2/tacotron2_updater.py @@ -113,16 +113,18 @@ class Tacotron2Updater(StandardUpdater): loss.backward() optimizer.step() + if self.use_guided_attn_loss: + report("train/attn_loss", float(attn_loss)) + losses_dict["attn_loss"] = float(attn_loss) + report("train/l1_loss", float(l1_loss)) report("train/mse_loss", float(mse_loss)) report("train/bce_loss", float(bce_loss)) - report("train/attn_loss", float(attn_loss)) report("train/loss", float(loss)) losses_dict["l1_loss"] = float(l1_loss) losses_dict["mse_loss"] = float(mse_loss) losses_dict["bce_loss"] = float(bce_loss) - losses_dict["attn_loss"] = float(attn_loss) losses_dict["loss"] = float(loss) self.msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_dict.items()) @@ -202,17 +204,19 @@ class Tacotron2Evaluator(StandardEvaluator): attn_loss = self.attn_loss( att_ws=att_ws, ilens=batch["text_lengths"] + 1, olens=olens_in) loss = loss + attn_loss + + if self.use_guided_attn_loss: + report("eval/attn_loss", float(attn_loss)) + losses_dict["attn_loss"] = float(attn_loss) report("eval/l1_loss", float(l1_loss)) report("eval/mse_loss", float(mse_loss)) report("eval/bce_loss", float(bce_loss)) - report("eval/attn_loss", float(attn_loss)) report("eval/loss", float(loss)) losses_dict["l1_loss"] = float(l1_loss) losses_dict["mse_loss"] = float(mse_loss) losses_dict["bce_loss"] = float(bce_loss) - losses_dict["attn_loss"] = float(attn_loss) losses_dict["loss"] = float(loss) self.msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_dict.items()) From 6b00ad6064a390525bd992dc747e1e5681b49db4 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Fri, 3 Feb 2023 09:57:51 +0800 Subject: [PATCH 22/24] [Install]clean dependencies (#2871) * 
clean dependencies * update paddleaudio's version * rm dependency in librosa and paddlenlp * rm dependency in paddlepaddle * rm dependency in speech_web --- audio/setup.py | 7 +------ .../speech_web/speech_server/requirements.txt | 4 +--- docs/requirements.txt | 15 +++------------ setup.py | 18 +++++------------- 4 files changed, 10 insertions(+), 34 deletions(-) diff --git a/audio/setup.py b/audio/setup.py index 6e358346c..823e5dfad 100644 --- a/audio/setup.py +++ b/audio/setup.py @@ -40,14 +40,9 @@ COMMITID = 'none' base = [ "kaldiio", "librosa==0.8.1", - "scipy>=1.0.0", - "soundfile~=0.10", - "colorlog", - "pathos==0.2.8", + "pathos", "pybind11", "parameterized", - "tqdm", - "scikit-learn" ] requirements = { diff --git a/demos/speech_web/speech_server/requirements.txt b/demos/speech_web/speech_server/requirements.txt index cdc654656..8425a1fee 100644 --- a/demos/speech_web/speech_server/requirements.txt +++ b/demos/speech_web/speech_server/requirements.txt @@ -1,8 +1,6 @@ aiofiles faiss-cpu -praatio==5.0.0 +praatio>=5.0.0 pydantic python-multipart -scikit_learn starlette -uvicorn diff --git a/docs/requirements.txt b/docs/requirements.txt index 5422c26f9..609f27925 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,11 +1,9 @@ braceexpand editdistance -fastapi g2p_en g2pM h5py inflect -jieba jsonlines kaldiio keyboard @@ -24,30 +22,23 @@ paddlespeech_ctcdecoders paddlespeech_feat pandas pattern_singleton -Pillow>=9.0.0 ppdiffusers>=0.9.0 -praatio==5.0.0 +praatio>=5.0.0 prettytable pypinyin-dict pypinyin<=0.44.0 python-dateutil -pyworld==0.2.12 +pyworld>=0.2.12 recommonmark>=0.5.0 -resampy==0.2.2 +resampy sacrebleu -scipy sphinx sphinx-autobuild sphinx-markdown-tables sphinx_rtd_theme textgrid timer -tqdm typeguard -uvicorn -visualdl webrtcvad websockets yacs~=0.1.8 diff --git a/setup.py b/setup.py index 2c97ce783..76bc5be8d 100644 --- a/setup.py +++ b/setup.py @@ -37,9 +37,7 @@ base = [ "g2pM", "h5py", "inflect", - "jieba", "jsonlines", - "kaldiio", "librosa==0.8.1", "loguru", "matplotlib", @@ -51,22 +49,16 @@ base = [ "paddlenlp>=2.4.8", "ppdiffusers>=0.9.0", "paddlespeech_feat", - "Pillow>=9.0.0", - "praatio==5.0.0", + "praatio>=5.0.0", "pypinyin<=0.44.0", "pypinyin-dict", "python-dateutil", - "pyworld==0.2.12", - "resampy==0.2.2", + "pyworld>=0.2.12", + "resampy", "sacrebleu", - "scipy", - "sentencepiece~=0.1.96", - "soundfile~=0.10", "textgrid", "timer", - "tqdm", "typeguard", - "visualdl", "webrtcvad", "yacs~=0.1.8", "prettytable", @@ -74,10 +66,10 @@ base = [ "braceexpand", "pyyaml", "paddleslim>=2.3.4", - "paddleaudio>=1.0.2", + "paddleaudio>=1.1.0", ] -server = ["fastapi", "uvicorn", "pattern_singleton", "websockets"] +server = ["pattern_singleton", "websockets"] requirements = { "install": From 089c060756c9fe5494ad9e13a57e61451103fee1 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 6 Feb 2023 19:59:02 +0800 Subject: [PATCH 23/24] fix pwgan tipc (#2882) --- tests/test_tipc/prepare.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/test_tipc/prepare.sh b/tests/test_tipc/prepare.sh index cb05a1d0f..9ff81bd8b 100755 --- a/tests/test_tipc/prepare.sh +++ b/tests/test_tipc/prepare.sh @@ -73,6 +73,9 @@ if [[ ${MODE} = "benchmark_train" ]];then mkdir -p BZNSYP unrar x BZNSYP.rar BZNSYP wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/benchmark/durations.txt + # avoid the program hanging when nltk_data cannot be downloaded due to network issues + wget -nc https://paddlespeech.bj.bcebos.com/Parakeet/tools/nltk_data.tar.gz + tar -xzf nltk_data.tar.gz -C ${HOME} # 
data preprocessing python ../paddlespeech/t2s/exps/gan_vocoder/preprocess.py --rootdir=BZNSYP/ --dumpdir=dump --num-cpu=20 --cut-sil=True --dur-file=durations.txt --config=../examples/csmsc/voc1/conf/default.yaml python ../utils/compute_statistics.py --metadata=dump/train/raw/metadata.jsonl --field-name="feats" From 16d84367c6c7452deb0cc9955aa40298271637b0 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Tue, 7 Feb 2023 10:10:53 +0800 Subject: [PATCH 24/24] fix Tensor.numpy()[0] to float(Tensor) to adapt to 0-D Tensor (#2884) --- examples/tess/cls0/local/train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tess/cls0/local/train.py b/examples/tess/cls0/local/train.py index 25382d8c3..f023a37b7 100644 --- a/examples/tess/cls0/local/train.py +++ b/examples/tess/cls0/local/train.py @@ -121,7 +121,7 @@ if __name__ == "__main__": optimizer.clear_grad() # Calculate loss - avg_loss += loss.numpy()[0] + avg_loss += float(loss) # Calculate metrics preds = paddle.argmax(logits, axis=1)
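
Taken together, the gradient-clipping changes in PATCH 13 and PATCH 15 are a mechanical mapping from deprecated `paddle.fluid.layers` ops onto their Paddle 2.x equivalents (`paddle.square`, `paddle.sum`, `paddle.sqrt`, `paddle.divide`, `paddle.maximum`, `paddle.multiply`). A minimal sketch of the same global-norm clipping math on toy gradients — not the `ClipGradByGlobalNormWithLog` class itself, and the variable names are illustrative:

```python
import paddle

# Toy gradients standing in for the (param, grad) pairs in params_grads.
grads = [paddle.rand([3, 4]), paddle.rand([5])]
clip_norm = 1.0

# Global gradient norm with 2.x ops
# (formerly layers.square / layers.reduce_sum / layers.sqrt).
sum_squares = [paddle.sum(paddle.square(g)) for g in grads]
global_norm = paddle.sqrt(paddle.add_n(sum_squares))

# Scale factor clip_norm / max(global_norm, clip_norm): equal to 1.0 when the
# norm is already small enough (formerly layers.elementwise_div / elementwise_max).
max_norm = paddle.full([1], clip_norm, dtype=global_norm.dtype)
clip_var = paddle.divide(max_norm, paddle.maximum(global_norm, max_norm))

# Rescale every gradient (formerly layers.elementwise_mul).
clipped = [paddle.multiply(g, clip_var) for g in grads]
print(float(global_norm), float(clip_var))
```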
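PATCH 19 zero-initialises the weights of the WaveNet denoiser's final 1x1 convolution, so the untrained network contributes (near-)zero predicted noise at the start of training, which tends to stabilise early diffusion optimisation. The same pattern in isolation, with hypothetical channel sizes:

```python
import paddle
from paddle import nn

# Final 1x1 conv of a denoiser; 256 skip channels -> 80 mel bins are
# illustrative values, not the exact configuration of the model.
final_conv = nn.Conv1D(256, 80, 1, bias_attr=True)
# Same initializer call as the patch: zero the weights in place.
nn.initializer.Constant(0.0)(final_conv.weight)

x = paddle.rand([2, 256, 100])
# With zero weights, only the (default-initialised) bias reaches the output.
print(float(final_conv(x).abs().max()))
```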
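PATCH 24 adapts to the 0-D Tensor semantics of newer Paddle releases: a scalar produced by a mean reduction may have shape `[]` rather than `[1]`, so `loss.numpy()[0]` raises an `IndexError`, while `float(loss)` handles both cases. A toy reproduction with random logits and labels, assuming a recent Paddle:

```python
import paddle

# A scalar loss as produced by a typical training step: the default
# 'mean' reduction yields a scalar Tensor (0-D on recent Paddle versions).
logits = paddle.rand([4, 10])
labels = paddle.randint(0, 10, [4])
loss = paddle.nn.functional.cross_entropy(logits, labels)

# Old pattern: assumes the scalar is wrapped in a 1-element array.
# loss.numpy()[0]  # raises IndexError once loss is a 0-D Tensor

# New pattern: float() works for both 0-D and shape-[1] scalar Tensors.
avg_loss = 0.0
avg_loss += float(loss)
print(avg_loss)
```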