From 30f6d7bcbfb580392af4d3381b179929493e811c Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Sun, 9 Jan 2022 12:18:01 +0000 Subject: [PATCH 01/60] fix config, test=asr --- examples/aishell/asr0/conf/deepspeech2.yaml | 2 +- examples/aishell/asr0/conf/deepspeech2_online.yaml | 2 +- examples/tiny/asr0/conf/deepspeech2.yaml | 2 +- examples/tiny/asr0/conf/deepspeech2_online.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/aishell/asr0/conf/deepspeech2.yaml b/examples/aishell/asr0/conf/deepspeech2.yaml index 1dc8581e0..ec9e02b66 100644 --- a/examples/aishell/asr0/conf/deepspeech2.yaml +++ b/examples/aishell/asr0/conf/deepspeech2.yaml @@ -23,7 +23,7 @@ augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: spectrum_type: linear -feat_dim: +feat_dim: 161 delta_delta: False stride_ms: 10.0 window_ms: 20.0 diff --git a/examples/aishell/asr0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml index c49973a26..05594e2d7 100644 --- a/examples/aishell/asr0/conf/deepspeech2_online.yaml +++ b/examples/aishell/asr0/conf/deepspeech2_online.yaml @@ -23,7 +23,7 @@ augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: spectrum_type: linear #linear, mfcc, fbank -feat_dim: +feat_dim: 161 delta_delta: False stride_ms: 10.0 window_ms: 20.0 diff --git a/examples/tiny/asr0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml index a16a79d3a..2cc4483eb 100644 --- a/examples/tiny/asr0/conf/deepspeech2.yaml +++ b/examples/tiny/asr0/conf/deepspeech2.yaml @@ -23,7 +23,7 @@ augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: spectrum_type: linear -feat_dim: +feat_dim: 161 delta_delta: False stride_ms: 10.0 window_ms: 20.0 diff --git a/examples/tiny/asr0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml index 5458cfb30..3bd4f6350 100644 --- a/examples/tiny/asr0/conf/deepspeech2_online.yaml +++ b/examples/tiny/asr0/conf/deepspeech2_online.yaml @@ -23,7 +23,7 @@ augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: spectrum_type: linear -feat_dim: +feat_dim: 161 delta_delta: False stride_ms: 10.0 window_ms: 20.0 From 35ca7f6e984699d88edf4c96017294d4284f1623 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 10 Jan 2022 02:40:46 +0000 Subject: [PATCH 02/60] fix config, test=doc_fix --- examples/aishell/asr0/conf/deepspeech2.yaml | 4 ++-- examples/aishell/asr0/conf/deepspeech2_online.yaml | 4 ++-- examples/librispeech/asr0/conf/deepspeech2.yaml | 4 ++-- examples/librispeech/asr0/conf/deepspeech2_online.yaml | 4 ++-- examples/tiny/asr0/conf/deepspeech2.yaml | 4 ++-- examples/tiny/asr0/conf/deepspeech2_online.yaml | 4 ++-- examples/wenetspeech/asr1/local/data.sh | 2 +- 7 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/aishell/asr0/conf/deepspeech2.yaml b/examples/aishell/asr0/conf/deepspeech2.yaml index ec9e02b66..fb6998647 100644 --- a/examples/aishell/asr0/conf/deepspeech2.yaml +++ b/examples/aishell/asr0/conf/deepspeech2.yaml @@ -54,9 +54,9 @@ ctc_grad_norm_type: instance ########################################### n_epoch: 80 accum_grad: 1 -lr: 2e-3 +lr: 2.0e-3 lr_decay: 0.83 -weight_decay: 1e-06 +weight_decay: 1.0e-6 global_grad_clip: 3.0 log_interval: 100 checkpoint: diff --git a/examples/aishell/asr0/conf/deepspeech2_online.yaml b/examples/aishell/asr0/conf/deepspeech2_online.yaml index 05594e2d7..ef01ac595 100644 --- a/examples/aishell/asr0/conf/deepspeech2_online.yaml +++ 
b/examples/aishell/asr0/conf/deepspeech2_online.yaml @@ -56,9 +56,9 @@ blank_id: 0 ########################################### n_epoch: 65 accum_grad: 1 -lr: 5e-4 +lr: 5.0e-4 lr_decay: 0.93 -weight_decay: 1e-06 +weight_decay: 1.0e-6 global_grad_clip: 3.0 log_interval: 100 checkpoint: diff --git a/examples/librispeech/asr0/conf/deepspeech2.yaml b/examples/librispeech/asr0/conf/deepspeech2.yaml index 0b0a1550d..0307b9f39 100644 --- a/examples/librispeech/asr0/conf/deepspeech2.yaml +++ b/examples/librispeech/asr0/conf/deepspeech2.yaml @@ -55,9 +55,9 @@ blank_id: 0 ########################################### n_epoch: 50 accum_grad: 1 -lr: 1e-3 +lr: 1.0e-3 lr_decay: 0.83 -weight_decay: 1e-06 +weight_decay: 1.0e-6 global_grad_clip: 5.0 log_interval: 100 checkpoint: diff --git a/examples/librispeech/asr0/conf/deepspeech2_online.yaml b/examples/librispeech/asr0/conf/deepspeech2_online.yaml index 8bd5a6727..a0d2bcfe2 100644 --- a/examples/librispeech/asr0/conf/deepspeech2_online.yaml +++ b/examples/librispeech/asr0/conf/deepspeech2_online.yaml @@ -57,9 +57,9 @@ blank_id: 0 ########################################### n_epoch: 50 accum_grad: 4 -lr: 1e-3 +lr: 1.0e-3 lr_decay: 0.83 -weight_decay: 1e-06 +weight_decay: 1.0e-6 global_grad_clip: 5.0 log_interval: 100 checkpoint: diff --git a/examples/tiny/asr0/conf/deepspeech2.yaml b/examples/tiny/asr0/conf/deepspeech2.yaml index 2cc4483eb..64d432e26 100644 --- a/examples/tiny/asr0/conf/deepspeech2.yaml +++ b/examples/tiny/asr0/conf/deepspeech2.yaml @@ -55,9 +55,9 @@ blank_id: 0 ########################################### n_epoch: 5 accum_grad: 1 -lr: 1e-5 +lr: 1.0e-5 lr_decay: 0.8 -weight_decay: 1e-06 +weight_decay: 1.0e-6 global_grad_clip: 5.0 log_interval: 1 checkpoint: diff --git a/examples/tiny/asr0/conf/deepspeech2_online.yaml b/examples/tiny/asr0/conf/deepspeech2_online.yaml index 3bd4f6350..74a4dc814 100644 --- a/examples/tiny/asr0/conf/deepspeech2_online.yaml +++ b/examples/tiny/asr0/conf/deepspeech2_online.yaml @@ -57,9 +57,9 @@ blank_id: 0 ########################################### n_epoch: 5 accum_grad: 1 -lr: 1e-5 +lr: 1.0e-5 lr_decay: 1.0 -weight_decay: 1e-06 +weight_decay: 1.0e-6 global_grad_clip: 5.0 log_interval: 1 checkpoint: diff --git a/examples/wenetspeech/asr1/local/data.sh b/examples/wenetspeech/asr1/local/data.sh index 7dd478d19..d216dd84a 100755 --- a/examples/wenetspeech/asr1/local/data.sh +++ b/examples/wenetspeech/asr1/local/data.sh @@ -96,7 +96,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/build_vocab.py \ --unit_type="char" \ --count_threshold=0 \ - --vocab_path="data/vocab.txt" \ + --vocab_path="data/lang_char/vocab.txt" \ --manifest_paths "data/manifest.train.raw" if [ $? 
-ne 0 ]; then From 5c9e4caa7b8603f3b9ad0fe3d81cdfa99043c115 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 10 Jan 2022 09:30:10 +0000 Subject: [PATCH 03/60] add en and decode_method for cli/asr, test=asr --- paddlespeech/cli/asr/infer.py | 91 ++++++------- paddlespeech/s2t/frontend/normalizer.py | 13 +- utils/generate_infer_yaml.py | 174 ++++++++++++++++++++++++ 3 files changed, 225 insertions(+), 53 deletions(-) create mode 100644 utils/generate_infer_yaml.py diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 8de964768..53379ed71 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -46,19 +46,29 @@ pretrained_models = { # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" "conformer_wenetspeech-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/conformer.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz', 'md5': - '54e7a558a6e020c2f5fb224874943f97', + 'b9afd8285ff5b2596bf96afab656b02f', 'cfg_path': - 'conf/conformer.yaml', + 'conf/conformer_infer.yaml', 'ckpt_path': 'exp/conformer/checkpoints/wenetspeech', }, + "transformer_librispeech-en-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz', + 'md5': + 'c95b9997f5f81478b32879a38532913d', + 'cfg_path': + 'conf/transformer_infer.yaml', + 'ckpt_path': + 'exp/transformer/checkpoints/avg_10', + }, } model_alias = { - "ds2_offline": "paddlespeech.s2t.models.ds2:DeepSpeech2Model", - "ds2_online": "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline", + "deepspeech2offline": "paddlespeech.s2t.models.ds2:DeepSpeech2Model", + "deepspeech2online": "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline", "conformer": "paddlespeech.s2t.models.u2:U2Model", "transformer": "paddlespeech.s2t.models.u2:U2Model", "wenetspeech": "paddlespeech.s2t.models.u2:U2Model", @@ -85,7 +95,7 @@ class ASRExecutor(BaseExecutor): '--lang', type=str, default='zh', - help='Choose model language. zh or en') + help='Choose model language. zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]') self.parser.add_argument( "--sample_rate", type=int, @@ -97,6 +107,12 @@ class ASRExecutor(BaseExecutor): type=str, default=None, help='Config of asr task. Use deault config when it is None.') + self.parser.add_argument( + '--decode_method', + type=str, + default='attention_rescoring', + choices=['ctc_greedy_search', 'ctc_prefix_beam_search', 'attention', 'attention_rescoring'], + help='only support transformer and conformer model') self.parser.add_argument( '--ckpt_path', type=str, @@ -136,6 +152,7 @@ class ASRExecutor(BaseExecutor): lang: str='zh', sample_rate: int=16000, cfg_path: Optional[os.PathLike]=None, + decode_method: str='attention_rescoring', ckpt_path: Optional[os.PathLike]=None): """ Init model and other resources from a specific path. @@ -165,45 +182,30 @@ class ASRExecutor(BaseExecutor): #Init body. 
self.config = CfgNode(new_allowed=True) self.config.merge_from_file(self.cfg_path) - self.config.decoding.decoding_method = "attention_rescoring" with UpdateConfig(self.config): - if "ds2_online" in model_type or "ds2_offline" in model_type: + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: from paddlespeech.s2t.io.collator import SpeechCollator - self.config.collator.vocab_filepath = os.path.join( - res_path, self.config.collator.vocab_filepath) - self.config.collator.mean_std_filepath = os.path.join( - res_path, self.config.collator.cmvn_path) + self.vocab = self.config.vocab_filepath + self.config.decode.lang_model_path = os.path.join(res_path, self.config.decode.lang_model_path) self.collate_fn_test = SpeechCollator.from_config(self.config) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) - self.config.model.input_dim = self.collate_fn_test.feature_size - self.config.model.output_dim = self.text_feature.vocab_size + unit_type=self.config.unit_type, + vocab=self.vocab) elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: - self.config.collator.vocab_filepath = os.path.join( - res_path, self.config.collator.vocab_filepath) - self.config.collator.augmentation_config = os.path.join( - res_path, self.config.collator.augmentation_config) - self.config.collator.spm_model_prefix = os.path.join( - res_path, self.config.collator.spm_model_prefix) + self.config.spm_model_prefix = os.path.join(self.res_path, self.config.spm_model_prefix) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) - self.config.model.input_dim = self.config.collator.feat_dim - self.config.model.output_dim = self.text_feature.vocab_size + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) + self.config.decode.decoding_method = decode_method else: raise Exception("wrong type") - # Enter the path of model root - model_name = model_type[:model_type.rindex( '_')] # model_type: {model_name}_{dataset} model_class = dynamic_import(model_name, model_alias) - model_conf = self.config.model - logger.info(model_conf) + model_conf = self.config model = model_class.from_config(model_conf) self.model = model self.model.eval() @@ -222,7 +224,7 @@ class ASRExecutor(BaseExecutor): logger.info("Preprocess audio_file:" + audio_file) # Get the object for feature extraction - if "ds2_online" in model_type or "ds2_offline" in model_type: + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: audio, _ = self.collate_fn_test.process_utterance( audio_file=audio_file, transcript=" ") audio_len = audio.shape[0] @@ -236,18 +238,7 @@ class ASRExecutor(BaseExecutor): elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: logger.info("get the preprocess conf") - preprocess_conf_file = self.config.collator.augmentation_config - # redirect the cmvn path - with io.open(preprocess_conf_file, encoding="utf-8") as f: - preprocess_conf = yaml.safe_load(f) - for idx, process in enumerate(preprocess_conf["process"]): - if process['type'] == "cmvn_json": - preprocess_conf["process"][idx][ - "cmvn_path"] = os.path.join( - self.res_path, - preprocess_conf["process"][idx]["cmvn_path"]) - break - 
logger.info(preprocess_conf) + preprocess_conf = self.config.preprocess_config preprocess_args = {"train": False} preprocessing = Transformation(preprocess_conf) logger.info("read the audio file") @@ -289,10 +280,10 @@ class ASRExecutor(BaseExecutor): Model inference and result stored in self.output. """ - cfg = self.config.decoding + cfg = self.config.decode audio = self._inputs["audio"] audio_len = self._inputs["audio_len"] - if "ds2_online" in model_type or "ds2_offline" in model_type: + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: result_transcripts = self.model.decode( audio, audio_len, @@ -414,12 +405,13 @@ class ASRExecutor(BaseExecutor): config = parser_args.config ckpt_path = parser_args.ckpt_path audio_file = parser_args.input + decode_method = parser_args.decode_method force_yes = parser_args.yes device = parser_args.device try: res = self(audio_file, model, lang, sample_rate, config, ckpt_path, - force_yes, device) + decode_method, force_yes, device) logger.info('ASR Result: {}'.format(res)) return True except Exception as e: @@ -434,6 +426,7 @@ class ASRExecutor(BaseExecutor): sample_rate: int=16000, config: os.PathLike=None, ckpt_path: os.PathLike=None, + decode_method: str='attention_rescoring', force_yes: bool=False, device=paddle.get_device()): """ @@ -442,7 +435,7 @@ class ASRExecutor(BaseExecutor): audio_file = os.path.abspath(audio_file) self._check(audio_file, sample_rate, force_yes) paddle.set_device(device) - self._init_from_path(model, lang, sample_rate, config, ckpt_path) + self._init_from_path(model, lang, sample_rate, config, decode_method, ckpt_path) self.preprocess(model, audio_file) self.infer(model) res = self.postprocess() # Retrieve result of asr. diff --git a/paddlespeech/s2t/frontend/normalizer.py b/paddlespeech/s2t/frontend/normalizer.py index 017851e63..b596b2ab0 100644 --- a/paddlespeech/s2t/frontend/normalizer.py +++ b/paddlespeech/s2t/frontend/normalizer.py @@ -117,7 +117,8 @@ class FeatureNormalizer(object): self._compute_mean_std(manifest_path, featurize_func, num_samples, num_workers) else: - self._read_mean_std_from_file(mean_std_filepath) + mean_std = mean_std_filepath + self._read_mean_std_from_file(mean_std) def apply(self, features): """Normalize features to be of zero mean and unit stddev. @@ -131,10 +132,14 @@ class FeatureNormalizer(object): """ return (features - self._mean) * self._istd - def _read_mean_std_from_file(self, filepath, eps=1e-20): + def _read_mean_std_from_file(self, mean_std, eps=1e-20): """Load mean and std from file.""" - filetype = filepath.split(".")[-1] - mean, istd = load_cmvn(filepath, filetype=filetype) + if isinstance(mean_std, list): + mean = mean_std[0]['cmvn_stats']['mean'] + istd = mean_std[0]['cmvn_stats']['istd'] + else: + filetype = mean_std.split(".")[-1] + mean, istd = load_cmvn(mean_std, filetype=filetype) self._mean = np.expand_dims(mean, axis=0) self._istd = np.expand_dims(istd, axis=0) diff --git a/utils/generate_infer_yaml.py b/utils/generate_infer_yaml.py new file mode 100644 index 000000000..5eed738c6 --- /dev/null +++ b/utils/generate_infer_yaml.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python3 +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +''' + Merge training configs into a single inference config. 
+''' + +import yaml +import json +import os +import argparse +import math +from yacs.config import CfgNode + +from paddlespeech.s2t.frontend.utility import load_dict +from contextlib import redirect_stdout + + +def save(save_path, config): + with open(save_path, 'w') as fp: + with redirect_stdout(fp): + print(config.dump()) + + +def load(save_path): + config = CfgNode(new_allowed=True) + config.merge_from_file(save_path) + return config + +def load_json(json_path): + with open(json_path) as f: + json_content = json.load(f) + return json_content + +def remove_config_part(config, key_list): + if len(key_list) == 0: + return + for i in range(len(key_list) -1): + config = config[key_list[i]] + config.pop(key_list[-1]) + +def load_cmvn_from_json(cmvn_stats): + means = cmvn_stats['mean_stat'] + variance = cmvn_stats['var_stat'] + count = cmvn_stats['frame_num'] + for i in range(len(means)): + means[i] /= count + variance[i] = variance[i] / count - means[i] * means[i] + if variance[i] < 1.0e-20: + variance[i] = 1.0e-20 + variance[i] = 1.0 / math.sqrt(variance[i]) + cmvn_stats = {"mean":means, "istd":variance} + return cmvn_stats + +def merge_configs( + conf_path = "conf/conformer.yaml", + preprocess_path = "conf/preprocess.yaml", + decode_path = "conf/tuning/decode.yaml", + vocab_path = "data/vocab.txt", + cmvn_path = "data/mean_std.json", + save_path = "conf/conformer_infer.yaml", + ): + + # Load the configs + config = load(conf_path) + decode_config = load(decode_path) + vocab_list = load_dict(vocab_path) + cmvn_stats = load_json(cmvn_path) + if os.path.exists(preprocess_path): + preprocess_config = load(preprocess_path) + for idx, process in enumerate(preprocess_config["process"]): + if process['type'] == "cmvn_json": + preprocess_config["process"][idx][ + "cmvn_path"] = cmvn_stats + break + + config.preprocess_config = preprocess_config + else: + cmvn_stats = load_cmvn_from_json(cmvn_stats) + config.mean_std_filepath = [{"cmvn_stats":cmvn_stats}] + config.augmentation_config = '' + + # Updata the config + config.vocab_filepath = vocab_list + config.input_dim = config.feat_dim + config.output_dim = len(config.vocab_filepath) + config.decode = decode_config + # Remove some parts of the config + + if os.path.exists(preprocess_path): + remove_list = ["train_manifest", + "dev_manifest", + "test_manifest", + "n_epoch", + "accum_grad", + "global_grad_clip", + "optim", + "optim_conf", + "scheduler", + "scheduler_conf", + "log_interval", + "checkpoint", + "shuffle_method", + "weight_decay", + "ctc_grad_norm_type", + "minibatches", + "batch_bins", + "batch_count", + "batch_frames_in", + "batch_frames_inout", + "batch_frames_out", + "sortagrad", + "feat_dim", + "stride_ms", + "window_ms", + "batch_size", + "maxlen_in", + "maxlen_out", + ] + else: + remove_list = ["train_manifest", + "dev_manifest", + "test_manifest", + "n_epoch", + "accum_grad", + "global_grad_clip", + "log_interval", + "checkpoint", + "lr", + "lr_decay", + "batch_size", + "shuffle_method", + "weight_decay", + "sortagrad", + "num_workers", + ] + + for item in remove_list: + try: + remove_config_part(config, [item]) + except: + print ( item + " " +"can not be removed") + + # Save the config + save(save_path, config) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog='Config merge', add_help=True) + parser.add_argument( + '--cfg_pth', type=str, default = 'conf/transformer.yaml', help='origin config file') + parser.add_argument( + '--pre_pth', type=str, default= "conf/preprocess.yaml", help='') + 
+    parser.add_argument(
+        '--dcd_pth', type=str, default= "conf/tuning/decode.yaml", help='')
+    parser.add_argument(
+        '--vb_pth', type=str, default= "data/lang_char/vocab.txt", help='')
+    parser.add_argument(
+        '--cmvn_pth', type=str, default= "data/mean_std.json", help='')
+    parser.add_argument(
+        '--save_pth', type=str, default= "conf/transformer_infer.yaml", help='')
+    parser_args = parser.parse_args()
+
+    merge_configs(
+        conf_path = parser_args.cfg_pth,
+        preprocess_path = parser_args.pre_pth,
+        vocab_path = parser_args.vb_pth,
+        cmvn_path = parser_args.cmvn_pth,
+        save_path = parser_args.save_pth,
+    )
+
+

From d902f3879119695d2d2835eccc04a1e4fb4085ee Mon Sep 17 00:00:00 2001
From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com>
Date: Mon, 10 Jan 2022 17:56:29 +0800
Subject: [PATCH 04/60] test=asr

---
 CHANGELOG.md | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4dc68c6ff..0374659b2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,2 +1,11 @@
 # Changelog
 
+
+Date: 2022-1-10, Author: Jackwaterveg.
+Add features to: CLI:
+  - Support English (librispeech/asr1/transformer).
+  - Support choosing `decode_method` for conformer and transformer models.
+  - Refactor the config, using the unified config.
+  - Pr_link: https://github.com/PaddlePaddle/PaddleSpeech/pull/1297
+
+***

From 11ba35d08be6f9826e88f3f54f403cd3e52a5fd3 Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Mon, 10 Jan 2022 12:08:31 +0000
Subject: [PATCH 05/60] fix, test=doc_fix

---
 CHANGELOG.md                  |  2 +-
 paddlespeech/cli/asr/infer.py | 10 +++++-----
 utils/generate_infer_yaml.py  | 10 +++++++---
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 0374659b2..5ffe80984 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,6 +6,6 @@ Add features to: CLI:
   - Support English (librispeech/asr1/transformer).
   - Support choosing `decode_method` for conformer and transformer models.
   - Refactor the config, using the unified config.
-  - Pr_link: https://github.com/PaddlePaddle/PaddleSpeech/pull/1297
+  - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1297
 
 ***

diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index 53379ed71..aa4e31d9e 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -48,9 +48,9 @@ pretrained_models = {
         'url':
         'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz',
         'md5':
-        'b9afd8285ff5b2596bf96afab656b02f',
+        '76cb19ed857e6623856b7cd7ebbfeda4',
         'cfg_path':
-        'conf/conformer_infer.yaml',
+        'model.yaml',
         'ckpt_path':
         'exp/conformer/checkpoints/wenetspeech',
     },
@@ -58,9 +58,9 @@ pretrained_models = {
         'url':
         'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz',
         'md5':
-        'c95b9997f5f81478b32879a38532913d',
+        '2c667da24922aad391eacafe37bc1660',
         'cfg_path':
-        'conf/transformer_infer.yaml',
+        'model.yaml',
         'ckpt_path':
         'exp/transformer/checkpoints/avg_10',
     },
@@ -176,7 +176,7 @@ class ASRExecutor(BaseExecutor):
         else:
             self.cfg_path = os.path.abspath(cfg_path)
             self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams")
-        res_path = os.path.dirname(
+        self.res_path = os.path.dirname(
             os.path.dirname(os.path.abspath(self.cfg_path)))
 
         #Init body.
diff --git a/utils/generate_infer_yaml.py b/utils/generate_infer_yaml.py index 5eed738c6..d2a6777c7 100644 --- a/utils/generate_infer_yaml.py +++ b/utils/generate_infer_yaml.py @@ -3,6 +3,8 @@ ''' Merge training configs into a single inference config. + The single inference config is for CLI, which only takes a single config to do inferencing. + The trainig configs includes: model config, preprocess config, decode config, vocab file and cmvn file. ''' import yaml @@ -88,7 +90,7 @@ def merge_configs( # Remove some parts of the config if os.path.exists(preprocess_path): - remove_list = ["train_manifest", + remove_train_list = ["train_manifest", "dev_manifest", "test_manifest", "n_epoch", @@ -104,6 +106,7 @@ def merge_configs( "weight_decay", "ctc_grad_norm_type", "minibatches", + "subsampling_factor", "batch_bins", "batch_count", "batch_frames_in", @@ -118,7 +121,7 @@ def merge_configs( "maxlen_out", ] else: - remove_list = ["train_manifest", + remove_train_list = ["train_manifest", "dev_manifest", "test_manifest", "n_epoch", @@ -135,7 +138,7 @@ def merge_configs( "num_workers", ] - for item in remove_list: + for item in remove_train_list: try: remove_config_part(config, [item]) except: @@ -165,6 +168,7 @@ if __name__ == "__main__": merge_configs( conf_path = parser_args.cfg_pth, + decode_path = parser_args.dcd_pth, preprocess_path = parser_args.pre_pth, vocab_path = parser_args.vb_pth, cmvn_path = parser_args.cmvn_pth, From 75c2bd5faff0e361cce219ce31b8e5e4a32f6b60 Mon Sep 17 00:00:00 2001 From: Jerryuhoo Date: Tue, 11 Jan 2022 14:43:19 +0800 Subject: [PATCH 06/60] fix link_wav.py path, test=tts --- examples/csmsc/voc3/finetune.sh | 2 +- examples/csmsc/voc5/finetune.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/csmsc/voc3/finetune.sh b/examples/csmsc/voc3/finetune.sh index ca7958cf4..6719bd0be 100755 --- a/examples/csmsc/voc3/finetune.sh +++ b/examples/csmsc/voc3/finetune.sh @@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - python3 link_wav.py \ + python3 ${MAIN_ROOT}/utils/link_wav.py \ --old-dump-dir=dump \ --dump-dir=dump_finetune fi diff --git a/examples/csmsc/voc5/finetune.sh b/examples/csmsc/voc5/finetune.sh index ca7958cf4..6719bd0be 100755 --- a/examples/csmsc/voc5/finetune.sh +++ b/examples/csmsc/voc5/finetune.sh @@ -21,7 +21,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then fi if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - python3 link_wav.py \ + python3 ${MAIN_ROOT}/utils/link_wav.py \ --old-dump-dir=dump \ --dump-dir=dump_finetune fi From c09466ebbe32e61888a98f6de52144e527998005 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Tue, 11 Jan 2022 14:46:57 +0800 Subject: [PATCH 07/60] Add ECAPA_TDNN. (#1295) --- paddlespeech/vector/models/ecapa_tdnn.py | 417 +++++++++++++++++++++++ 1 file changed, 417 insertions(+) create mode 100644 paddlespeech/vector/models/ecapa_tdnn.py diff --git a/paddlespeech/vector/models/ecapa_tdnn.py b/paddlespeech/vector/models/ecapa_tdnn.py new file mode 100644 index 000000000..5512f5097 --- /dev/null +++ b/paddlespeech/vector/models/ecapa_tdnn.py @@ -0,0 +1,417 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + + +def length_to_mask(length, max_len=None, dtype=None): + assert len(length.shape) == 1 + + if max_len is None: + max_len = length.max().astype( + 'int').item() # using arange to generate mask + mask = paddle.arange( + max_len, dtype=length.dtype).expand( + (len(length), max_len)) < length.unsqueeze(1) + + if dtype is None: + dtype = length.dtype + + mask = paddle.to_tensor(mask, dtype=dtype) + return mask + + +class Conv1d(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding="same", + dilation=1, + groups=1, + bias=True, + padding_mode="reflect", ): + super(Conv1d, self).__init__() + + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.padding = padding + self.padding_mode = padding_mode + + self.conv = nn.Conv1D( + in_channels, + out_channels, + self.kernel_size, + stride=self.stride, + padding=0, + dilation=self.dilation, + groups=groups, + bias_attr=bias, ) + + def forward(self, x): + if self.padding == "same": + x = self._manage_padding(x, self.kernel_size, self.dilation, + self.stride) + else: + raise ValueError("Padding must be 'same'. Got {self.padding}") + + return self.conv(x) + + def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int): + L_in = x.shape[-1] # Detecting input shape + padding = self._get_padding_elem(L_in, stride, kernel_size, + dilation) # Time padding + x = F.pad( + x, padding, mode=self.padding_mode, + data_format="NCL") # Applying padding + return x + + def _get_padding_elem(self, + L_in: int, + stride: int, + kernel_size: int, + dilation: int): + if stride > 1: + n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1) + L_out = stride * (n_steps - 1) + kernel_size * dilation + padding = [kernel_size // 2, kernel_size // 2] + else: + L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1 + + padding = [(L_in - L_out) // 2, (L_in - L_out) // 2] + + return padding + + +class BatchNorm1d(nn.Layer): + def __init__( + self, + input_size, + eps=1e-05, + momentum=0.9, + weight_attr=None, + bias_attr=None, + data_format='NCL', + use_global_stats=None, ): + super(BatchNorm1d, self).__init__() + + self.norm = nn.BatchNorm1D( + input_size, + epsilon=eps, + momentum=momentum, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format, + use_global_stats=use_global_stats, ) + + def forward(self, x): + x_n = self.norm(x) + return x_n + + +class TDNNBlock(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + dilation, + activation=nn.ReLU, ): + super(TDNNBlock, self).__init__() + self.conv = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + dilation=dilation, ) + self.activation = activation() + self.norm = BatchNorm1d(input_size=out_channels) + + def forward(self, x): + return self.norm(self.activation(self.conv(x))) + + +class Res2NetBlock(nn.Layer): + def __init__(self, in_channels, out_channels, scale=8, dilation=1): + super(Res2NetBlock, self).__init__() + 
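+        # Res2Net-style multi-scale processing: forward() splits the channels
+        # into `scale` groups; the first group passes through unchanged, each
+        # remaining group goes through a small TDNN block (summed with the
+        # previous group's output from the second one on), which builds
+        # progressively larger receptive fields before re-concatenation.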
assert in_channels % scale == 0 + assert out_channels % scale == 0 + + in_channel = in_channels // scale + hidden_channel = out_channels // scale + + self.blocks = nn.LayerList([ + TDNNBlock( + in_channel, hidden_channel, kernel_size=3, dilation=dilation) + for i in range(scale - 1) + ]) + self.scale = scale + + def forward(self, x): + y = [] + for i, x_i in enumerate(paddle.chunk(x, self.scale, axis=1)): + if i == 0: + y_i = x_i + elif i == 1: + y_i = self.blocks[i - 1](x_i) + else: + y_i = self.blocks[i - 1](x_i + y_i) + y.append(y_i) + y = paddle.concat(y, axis=1) + return y + + +class SEBlock(nn.Layer): + def __init__(self, in_channels, se_channels, out_channels): + super(SEBlock, self).__init__() + + self.conv1 = Conv1d( + in_channels=in_channels, out_channels=se_channels, kernel_size=1) + self.relu = paddle.nn.ReLU() + self.conv2 = Conv1d( + in_channels=se_channels, out_channels=out_channels, kernel_size=1) + self.sigmoid = paddle.nn.Sigmoid() + + def forward(self, x, lengths=None): + L = x.shape[-1] + if lengths is not None: + mask = length_to_mask(lengths * L, max_len=L) + mask = mask.unsqueeze(1) + total = mask.sum(axis=2, keepdim=True) + s = (x * mask).sum(axis=2, keepdim=True) / total + else: + s = x.mean(axis=2, keepdim=True) + + s = self.relu(self.conv1(s)) + s = self.sigmoid(self.conv2(s)) + + return s * x + + +class AttentiveStatisticsPooling(nn.Layer): + def __init__(self, channels, attention_channels=128, global_context=True): + super().__init__() + + self.eps = 1e-12 + self.global_context = global_context + if global_context: + self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1) + else: + self.tdnn = TDNNBlock(channels, attention_channels, 1, 1) + self.tanh = nn.Tanh() + self.conv = Conv1d( + in_channels=attention_channels, + out_channels=channels, + kernel_size=1) + + def forward(self, x, lengths=None): + C, L = x.shape[1], x.shape[2] # KP: (N, C, L) + + def _compute_statistics(x, m, axis=2, eps=self.eps): + mean = (m * x).sum(axis) + std = paddle.sqrt( + (m * (x - mean.unsqueeze(axis)).pow(2)).sum(axis).clip(eps)) + return mean, std + + if lengths is None: + lengths = paddle.ones([x.shape[0]]) + + # Make binary mask of shape [N, 1, L] + mask = length_to_mask(lengths * L, max_len=L) + mask = mask.unsqueeze(1) + + # Expand the temporal context of the pooling layer by allowing the + # self-attention to look at global properties of the utterance. 
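+        # Concretely: the utterance-level mean and std are tiled along the
+        # time axis and concatenated to every frame before the attention
+        # weights are computed, so each frame's score can depend on global
+        # statistics of the utterance.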
+ if self.global_context: + total = mask.sum(axis=2, keepdim=True).astype('float32') + mean, std = _compute_statistics(x, mask / total) + mean = mean.unsqueeze(2).tile((1, 1, L)) + std = std.unsqueeze(2).tile((1, 1, L)) + attn = paddle.concat([x, mean, std], axis=1) + else: + attn = x + + # Apply layers + attn = self.conv(self.tanh(self.tdnn(attn))) + + # Filter out zero-paddings + attn = paddle.where( + mask.tile((1, C, 1)) == 0, + paddle.ones_like(attn) * float("-inf"), attn) + + attn = F.softmax(attn, axis=2) + mean, std = _compute_statistics(x, attn) + + # Append mean and std of the batch + pooled_stats = paddle.concat((mean, std), axis=1) + pooled_stats = pooled_stats.unsqueeze(2) + + return pooled_stats + + +class SERes2NetBlock(nn.Layer): + def __init__( + self, + in_channels, + out_channels, + res2net_scale=8, + se_channels=128, + kernel_size=1, + dilation=1, + activation=nn.ReLU, ): + super(SERes2NetBlock, self).__init__() + self.out_channels = out_channels + self.tdnn1 = TDNNBlock( + in_channels, + out_channels, + kernel_size=1, + dilation=1, + activation=activation, ) + self.res2net_block = Res2NetBlock(out_channels, out_channels, + res2net_scale, dilation) + self.tdnn2 = TDNNBlock( + out_channels, + out_channels, + kernel_size=1, + dilation=1, + activation=activation, ) + self.se_block = SEBlock(out_channels, se_channels, out_channels) + + self.shortcut = None + if in_channels != out_channels: + self.shortcut = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, ) + + def forward(self, x, lengths=None): + residual = x + if self.shortcut: + residual = self.shortcut(x) + + x = self.tdnn1(x) + x = self.res2net_block(x) + x = self.tdnn2(x) + x = self.se_block(x, lengths) + + return x + residual + + +class ECAPA_TDNN(nn.Layer): + def __init__( + self, + input_size, + lin_neurons=192, + activation=nn.ReLU, + channels=[512, 512, 512, 512, 1536], + kernel_sizes=[5, 3, 3, 3, 1], + dilations=[1, 2, 3, 4, 1], + attention_channels=128, + res2net_scale=8, + se_channels=128, + global_context=True, ): + + super(ECAPA_TDNN, self).__init__() + assert len(channels) == len(kernel_sizes) + assert len(channels) == len(dilations) + self.channels = channels + self.blocks = nn.LayerList() + self.emb_size = lin_neurons + + # The initial TDNN layer + self.blocks.append( + TDNNBlock( + input_size, + channels[0], + kernel_sizes[0], + dilations[0], + activation, )) + + # SE-Res2Net layers + for i in range(1, len(channels) - 1): + self.blocks.append( + SERes2NetBlock( + channels[i - 1], + channels[i], + res2net_scale=res2net_scale, + se_channels=se_channels, + kernel_size=kernel_sizes[i], + dilation=dilations[i], + activation=activation, )) + + # Multi-layer feature aggregation + self.mfa = TDNNBlock( + channels[-1], + channels[-1], + kernel_sizes[-1], + dilations[-1], + activation, ) + + # Attentive Statistical Pooling + self.asp = AttentiveStatisticsPooling( + channels[-1], + attention_channels=attention_channels, + global_context=global_context, ) + self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2) + + # Final linear transformation + self.fc = Conv1d( + in_channels=channels[-1] * 2, + out_channels=self.emb_size, + kernel_size=1, ) + + def forward(self, x, lengths=None): + xl = [] + for layer in self.blocks: + try: + x = layer(x, lengths=lengths) + except TypeError: + x = layer(x) + xl.append(x) + + # Multi-layer feature aggregation + x = paddle.concat(xl[1:], axis=1) + x = self.mfa(x) + + # Attentive Statistical Pooling + x = self.asp(x, lengths=lengths) + x = 
self.asp_bn(x) + + # Final linear transformation + x = self.fc(x) + + return x + + +class Classifier(nn.Layer): + def __init__(self, backbone, num_class, dtype=paddle.float32): + super(Classifier, self).__init__() + self.backbone = backbone + self.params = nn.ParameterList([ + paddle.create_parameter( + shape=[num_class, self.backbone.emb_size], dtype=dtype) + ]) + + def forward(self, x): + emb = self.backbone(x.transpose([0, 2, 1])).transpose([0, 2, 1]) + logits = F.linear( + F.normalize(emb.squeeze(1)), + F.normalize(self.params[0]).transpose([1, 0])) + + return logits From 010aa65b2b59f2578ea73114c38f26ba5c37a36c Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Tue, 11 Jan 2022 14:48:36 +0800 Subject: [PATCH 08/60] [cli] asr - support English, decode_metod and unified config (#1297) * fix config, test=asr * fix config, test=doc_fix * add en and decode_method for cli/asr, test=asr * test=asr * fix, test=doc_fix --- CHANGELOG.md | 9 ++ paddlespeech/cli/asr/infer.py | 93 ++++++------- paddlespeech/s2t/frontend/normalizer.py | 13 +- utils/generate_infer_yaml.py | 178 ++++++++++++++++++++++++ 4 files changed, 239 insertions(+), 54 deletions(-) create mode 100644 utils/generate_infer_yaml.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 4dc68c6ff..5ffe80984 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,2 +1,11 @@ # Changelog + +Date: 2022-1-10, Author: Jackwaterveg. +Add features to: CLI: + - Support English (librispeech/asr1/transformer). + - Support choosing `decode_method` for conformer and transformer models. + - Refactor the config, using the unified config. + - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1297 + +*** diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 8de964768..aa4e31d9e 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -46,19 +46,29 @@ pretrained_models = { # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav" "conformer_wenetspeech-zh-16k": { 'url': - 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/conformer.model.tar.gz', + 'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz', 'md5': - '54e7a558a6e020c2f5fb224874943f97', + '76cb19ed857e6623856b7cd7ebbfeda4', 'cfg_path': - 'conf/conformer.yaml', + 'model.yaml', 'ckpt_path': 'exp/conformer/checkpoints/wenetspeech', }, + "transformer_librispeech-en-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz', + 'md5': + '2c667da24922aad391eacafe37bc1660', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/transformer/checkpoints/avg_10', + }, } model_alias = { - "ds2_offline": "paddlespeech.s2t.models.ds2:DeepSpeech2Model", - "ds2_online": "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline", + "deepspeech2offline": "paddlespeech.s2t.models.ds2:DeepSpeech2Model", + "deepspeech2online": "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline", "conformer": "paddlespeech.s2t.models.u2:U2Model", "transformer": "paddlespeech.s2t.models.u2:U2Model", "wenetspeech": "paddlespeech.s2t.models.u2:U2Model", @@ -85,7 +95,7 @@ class ASRExecutor(BaseExecutor): '--lang', type=str, default='zh', - help='Choose model language. zh or en') + help='Choose model language. 
zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]') self.parser.add_argument( "--sample_rate", type=int, @@ -97,6 +107,12 @@ class ASRExecutor(BaseExecutor): type=str, default=None, help='Config of asr task. Use deault config when it is None.') + self.parser.add_argument( + '--decode_method', + type=str, + default='attention_rescoring', + choices=['ctc_greedy_search', 'ctc_prefix_beam_search', 'attention', 'attention_rescoring'], + help='only support transformer and conformer model') self.parser.add_argument( '--ckpt_path', type=str, @@ -136,6 +152,7 @@ class ASRExecutor(BaseExecutor): lang: str='zh', sample_rate: int=16000, cfg_path: Optional[os.PathLike]=None, + decode_method: str='attention_rescoring', ckpt_path: Optional[os.PathLike]=None): """ Init model and other resources from a specific path. @@ -159,51 +176,36 @@ class ASRExecutor(BaseExecutor): else: self.cfg_path = os.path.abspath(cfg_path) self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams") - res_path = os.path.dirname( + self.res_path = os.path.dirname( os.path.dirname(os.path.abspath(self.cfg_path))) #Init body. self.config = CfgNode(new_allowed=True) self.config.merge_from_file(self.cfg_path) - self.config.decoding.decoding_method = "attention_rescoring" with UpdateConfig(self.config): - if "ds2_online" in model_type or "ds2_offline" in model_type: + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: from paddlespeech.s2t.io.collator import SpeechCollator - self.config.collator.vocab_filepath = os.path.join( - res_path, self.config.collator.vocab_filepath) - self.config.collator.mean_std_filepath = os.path.join( - res_path, self.config.collator.cmvn_path) + self.vocab = self.config.vocab_filepath + self.config.decode.lang_model_path = os.path.join(res_path, self.config.decode.lang_model_path) self.collate_fn_test = SpeechCollator.from_config(self.config) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) - self.config.model.input_dim = self.collate_fn_test.feature_size - self.config.model.output_dim = self.text_feature.vocab_size + unit_type=self.config.unit_type, + vocab=self.vocab) elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: - self.config.collator.vocab_filepath = os.path.join( - res_path, self.config.collator.vocab_filepath) - self.config.collator.augmentation_config = os.path.join( - res_path, self.config.collator.augmentation_config) - self.config.collator.spm_model_prefix = os.path.join( - res_path, self.config.collator.spm_model_prefix) + self.config.spm_model_prefix = os.path.join(self.res_path, self.config.spm_model_prefix) self.text_feature = TextFeaturizer( - unit_type=self.config.collator.unit_type, - vocab=self.config.collator.vocab_filepath, - spm_model_prefix=self.config.collator.spm_model_prefix) - self.config.model.input_dim = self.config.collator.feat_dim - self.config.model.output_dim = self.text_feature.vocab_size + unit_type=self.config.unit_type, + vocab=self.config.vocab_filepath, + spm_model_prefix=self.config.spm_model_prefix) + self.config.decode.decoding_method = decode_method else: raise Exception("wrong type") - # Enter the path of model root - model_name = model_type[:model_type.rindex( '_')] # model_type: {model_name}_{dataset} model_class = dynamic_import(model_name, model_alias) - model_conf = self.config.model - logger.info(model_conf) + model_conf = 
self.config model = model_class.from_config(model_conf) self.model = model self.model.eval() @@ -222,7 +224,7 @@ class ASRExecutor(BaseExecutor): logger.info("Preprocess audio_file:" + audio_file) # Get the object for feature extraction - if "ds2_online" in model_type or "ds2_offline" in model_type: + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: audio, _ = self.collate_fn_test.process_utterance( audio_file=audio_file, transcript=" ") audio_len = audio.shape[0] @@ -236,18 +238,7 @@ class ASRExecutor(BaseExecutor): elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: logger.info("get the preprocess conf") - preprocess_conf_file = self.config.collator.augmentation_config - # redirect the cmvn path - with io.open(preprocess_conf_file, encoding="utf-8") as f: - preprocess_conf = yaml.safe_load(f) - for idx, process in enumerate(preprocess_conf["process"]): - if process['type'] == "cmvn_json": - preprocess_conf["process"][idx][ - "cmvn_path"] = os.path.join( - self.res_path, - preprocess_conf["process"][idx]["cmvn_path"]) - break - logger.info(preprocess_conf) + preprocess_conf = self.config.preprocess_config preprocess_args = {"train": False} preprocessing = Transformation(preprocess_conf) logger.info("read the audio file") @@ -289,10 +280,10 @@ class ASRExecutor(BaseExecutor): Model inference and result stored in self.output. """ - cfg = self.config.decoding + cfg = self.config.decode audio = self._inputs["audio"] audio_len = self._inputs["audio_len"] - if "ds2_online" in model_type or "ds2_offline" in model_type: + if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: result_transcripts = self.model.decode( audio, audio_len, @@ -414,12 +405,13 @@ class ASRExecutor(BaseExecutor): config = parser_args.config ckpt_path = parser_args.ckpt_path audio_file = parser_args.input + decode_method = parser_args.decode_method force_yes = parser_args.yes device = parser_args.device try: res = self(audio_file, model, lang, sample_rate, config, ckpt_path, - force_yes, device) + decode_method, force_yes, device) logger.info('ASR Result: {}'.format(res)) return True except Exception as e: @@ -434,6 +426,7 @@ class ASRExecutor(BaseExecutor): sample_rate: int=16000, config: os.PathLike=None, ckpt_path: os.PathLike=None, + decode_method: str='attention_rescoring', force_yes: bool=False, device=paddle.get_device()): """ @@ -442,7 +435,7 @@ class ASRExecutor(BaseExecutor): audio_file = os.path.abspath(audio_file) self._check(audio_file, sample_rate, force_yes) paddle.set_device(device) - self._init_from_path(model, lang, sample_rate, config, ckpt_path) + self._init_from_path(model, lang, sample_rate, config, decode_method, ckpt_path) self.preprocess(model, audio_file) self.infer(model) res = self.postprocess() # Retrieve result of asr. diff --git a/paddlespeech/s2t/frontend/normalizer.py b/paddlespeech/s2t/frontend/normalizer.py index 017851e63..b596b2ab0 100644 --- a/paddlespeech/s2t/frontend/normalizer.py +++ b/paddlespeech/s2t/frontend/normalizer.py @@ -117,7 +117,8 @@ class FeatureNormalizer(object): self._compute_mean_std(manifest_path, featurize_func, num_samples, num_workers) else: - self._read_mean_std_from_file(mean_std_filepath) + mean_std = mean_std_filepath + self._read_mean_std_from_file(mean_std) def apply(self, features): """Normalize features to be of zero mean and unit stddev. 
@@ -131,10 +132,14 @@ class FeatureNormalizer(object): """ return (features - self._mean) * self._istd - def _read_mean_std_from_file(self, filepath, eps=1e-20): + def _read_mean_std_from_file(self, mean_std, eps=1e-20): """Load mean and std from file.""" - filetype = filepath.split(".")[-1] - mean, istd = load_cmvn(filepath, filetype=filetype) + if isinstance(mean_std, list): + mean = mean_std[0]['cmvn_stats']['mean'] + istd = mean_std[0]['cmvn_stats']['istd'] + else: + filetype = mean_std.split(".")[-1] + mean, istd = load_cmvn(mean_std, filetype=filetype) self._mean = np.expand_dims(mean, axis=0) self._istd = np.expand_dims(istd, axis=0) diff --git a/utils/generate_infer_yaml.py b/utils/generate_infer_yaml.py new file mode 100644 index 000000000..d2a6777c7 --- /dev/null +++ b/utils/generate_infer_yaml.py @@ -0,0 +1,178 @@ +#!/usr/bin/env python3 +# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) + +''' + Merge training configs into a single inference config. + The single inference config is for CLI, which only takes a single config to do inferencing. + The trainig configs includes: model config, preprocess config, decode config, vocab file and cmvn file. +''' + +import yaml +import json +import os +import argparse +import math +from yacs.config import CfgNode + +from paddlespeech.s2t.frontend.utility import load_dict +from contextlib import redirect_stdout + + +def save(save_path, config): + with open(save_path, 'w') as fp: + with redirect_stdout(fp): + print(config.dump()) + + +def load(save_path): + config = CfgNode(new_allowed=True) + config.merge_from_file(save_path) + return config + +def load_json(json_path): + with open(json_path) as f: + json_content = json.load(f) + return json_content + +def remove_config_part(config, key_list): + if len(key_list) == 0: + return + for i in range(len(key_list) -1): + config = config[key_list[i]] + config.pop(key_list[-1]) + +def load_cmvn_from_json(cmvn_stats): + means = cmvn_stats['mean_stat'] + variance = cmvn_stats['var_stat'] + count = cmvn_stats['frame_num'] + for i in range(len(means)): + means[i] /= count + variance[i] = variance[i] / count - means[i] * means[i] + if variance[i] < 1.0e-20: + variance[i] = 1.0e-20 + variance[i] = 1.0 / math.sqrt(variance[i]) + cmvn_stats = {"mean":means, "istd":variance} + return cmvn_stats + +def merge_configs( + conf_path = "conf/conformer.yaml", + preprocess_path = "conf/preprocess.yaml", + decode_path = "conf/tuning/decode.yaml", + vocab_path = "data/vocab.txt", + cmvn_path = "data/mean_std.json", + save_path = "conf/conformer_infer.yaml", + ): + + # Load the configs + config = load(conf_path) + decode_config = load(decode_path) + vocab_list = load_dict(vocab_path) + cmvn_stats = load_json(cmvn_path) + if os.path.exists(preprocess_path): + preprocess_config = load(preprocess_path) + for idx, process in enumerate(preprocess_config["process"]): + if process['type'] == "cmvn_json": + preprocess_config["process"][idx][ + "cmvn_path"] = cmvn_stats + break + + config.preprocess_config = preprocess_config + else: + cmvn_stats = load_cmvn_from_json(cmvn_stats) + config.mean_std_filepath = [{"cmvn_stats":cmvn_stats}] + config.augmentation_config = '' + + # Updata the config + config.vocab_filepath = vocab_list + config.input_dim = config.feat_dim + config.output_dim = len(config.vocab_filepath) + config.decode = decode_config + # Remove some parts of the config + + if os.path.exists(preprocess_path): + remove_train_list = ["train_manifest", + "dev_manifest", + "test_manifest", + "n_epoch", + 
"accum_grad", + "global_grad_clip", + "optim", + "optim_conf", + "scheduler", + "scheduler_conf", + "log_interval", + "checkpoint", + "shuffle_method", + "weight_decay", + "ctc_grad_norm_type", + "minibatches", + "subsampling_factor", + "batch_bins", + "batch_count", + "batch_frames_in", + "batch_frames_inout", + "batch_frames_out", + "sortagrad", + "feat_dim", + "stride_ms", + "window_ms", + "batch_size", + "maxlen_in", + "maxlen_out", + ] + else: + remove_train_list = ["train_manifest", + "dev_manifest", + "test_manifest", + "n_epoch", + "accum_grad", + "global_grad_clip", + "log_interval", + "checkpoint", + "lr", + "lr_decay", + "batch_size", + "shuffle_method", + "weight_decay", + "sortagrad", + "num_workers", + ] + + for item in remove_train_list: + try: + remove_config_part(config, [item]) + except: + print ( item + " " +"can not be removed") + + # Save the config + save(save_path, config) + + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog='Config merge', add_help=True) + parser.add_argument( + '--cfg_pth', type=str, default = 'conf/transformer.yaml', help='origin config file') + parser.add_argument( + '--pre_pth', type=str, default= "conf/preprocess.yaml", help='') + parser.add_argument( + '--dcd_pth', type=str, default= "conf/tuninig/decode.yaml", help='') + parser.add_argument( + '--vb_pth', type=str, default= "data/lang_char/vocab.txt", help='') + parser.add_argument( + '--cmvn_pth', type=str, default= "data/mean_std.json", help='') + parser.add_argument( + '--save_pth', type=str, default= "conf/transformer_infer.yaml", help='') + parser_args = parser.parse_args() + + merge_configs( + conf_path = parser_args.cfg_pth, + decode_path = parser_args.dcd_pth, + preprocess_path = parser_args.pre_pth, + vocab_path = parser_args.vb_pth, + cmvn_path = parser_args.cmvn_pth, + save_path = parser_args.save_pth, + ) + + From fcc34e3e95b8dbdf4ef7b701b1d4a25712f27121 Mon Sep 17 00:00:00 2001 From: Jerryuhoo Date: Tue, 11 Jan 2022 14:53:38 +0800 Subject: [PATCH 09/60] [tts] add gen_gta_mel.py for finetuning speedypeech, test=tts --- .../t2s/exps/speedyspeech/gen_gta_mel.py | 235 ++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py diff --git a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py new file mode 100644 index 000000000..ddd961a95 --- /dev/null +++ b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py @@ -0,0 +1,235 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# generate mels using durations.txt +# for mb melgan finetune +# 长度和原本的 mel 不一致怎么办? 
+import argparse
+from pathlib import Path
+
+import numpy as np
+import paddle
+import yaml
+from yacs.config import CfgNode
+from tqdm import tqdm
+import os
+
+from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur
+from paddlespeech.t2s.datasets.preprocess_utils import merge_silence
+from paddlespeech.t2s.models.speedyspeech import SpeedySpeech
+from paddlespeech.t2s.models.speedyspeech import SpeedySpeechInference
+from paddlespeech.t2s.modules.normalizer import ZScore
+from paddlespeech.t2s.frontend.zh_frontend import Frontend
+
+def evaluate(args, speedyspeech_config):
+    rootdir = Path(args.rootdir).expanduser()
+    assert rootdir.is_dir()
+
+    # construct dataset for evaluation
+    with open(args.phones_dict, "r") as f:
+        phn_id = [line.strip().split() for line in f.readlines()]
+    vocab_size = len(phn_id)
+    print("vocab_size:", vocab_size)
+
+    phone_dict = {}
+    for phn, id in phn_id:
+        phone_dict[phn] = int(id)
+
+    with open(args.tones_dict, "r") as f:
+        tone_id = [line.strip().split() for line in f.readlines()]
+    tone_size = len(tone_id)
+    print("tone_size:", tone_size)
+
+    frontend = Frontend(phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict)
+
+    if args.speaker_dict:
+        with open(args.speaker_dict, 'rt') as f:
+            spk_id_list = [line.strip().split() for line in f.readlines()]
+        spk_num = len(spk_id_list)
+    else:
+        spk_num=None
+
+    model = SpeedySpeech(
+        vocab_size=vocab_size, tone_size=tone_size, **speedyspeech_config["model"], spk_num=spk_num)
+
+    model.set_state_dict(
+        paddle.load(args.speedyspeech_checkpoint)["main_params"])
+    model.eval()
+
+    stat = np.load(args.speedyspeech_stat)
+    mu, std = stat
+    mu = paddle.to_tensor(mu)
+    std = paddle.to_tensor(std)
+    speedyspeech_normalizer = ZScore(mu, std)
+
+    speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer,
+                                                   model)
+    speedyspeech_inference.eval()
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    sentences, speaker_set = get_phn_dur(args.dur_file)
+    merge_silence(sentences)
+
+    if args.dataset == "baker":
+        wav_files = sorted(list((rootdir / "Wave").rglob("*.wav")))
+        # split data into 3 sections
+        num_train = 9800
+        num_dev = 100
+        train_wav_files = wav_files[:num_train]
+        dev_wav_files = wav_files[num_train:num_train + num_dev]
+        test_wav_files = wav_files[num_train + num_dev:]
+    elif args.dataset == "aishell3":
+        sub_num_dev = 5
+        wav_dir = rootdir / "train" / "wav"
+        train_wav_files = []
+        dev_wav_files = []
+        test_wav_files = []
+        for speaker in os.listdir(wav_dir):
+            wav_files = sorted(list((wav_dir / speaker).rglob("*.wav")))
+            if len(wav_files) > 100:
+                train_wav_files += wav_files[:-sub_num_dev * 2]
+                dev_wav_files += wav_files[-sub_num_dev * 2:-sub_num_dev]
+                test_wav_files += wav_files[-sub_num_dev:]
+            else:
+                train_wav_files += wav_files
+
+    train_wav_files = [os.path.basename(str(str_path)) for str_path in train_wav_files]
+    dev_wav_files = [os.path.basename(str(str_path)) for str_path in dev_wav_files]
+    test_wav_files = [os.path.basename(str(str_path)) for str_path in test_wav_files]
+
+    for i, utt_id in enumerate(tqdm(sentences)):
+        phones = sentences[utt_id][0]
+        durations = sentences[utt_id][1]
+        speaker = sentences[utt_id][2]
+        # trim the leading and trailing sil
+        if args.cut_sil:
+            if phones[0] == "sil" and len(durations) > 1:
+                durations = durations[1:]
+                phones = phones[1:]
+            if phones[-1] == 'sil' and len(durations) > 1:
+                durations = durations[:-1]
+                phones = phones[:-1]
+
+        phones, tones = frontend._get_phone_tone(
+            phones,
+            get_tone_ids=True)
+        if tones:
+            tone_ids = frontend._t2id(tones)
+            tone_ids = paddle.to_tensor(tone_ids)
+        if phones:
+            phone_ids = frontend._p2id(phones)
+            phone_ids = paddle.to_tensor(phone_ids)
+
+        if args.speaker_dict:
+            speaker_id = int([item[1] for item in spk_id_list if speaker == item[0]][0])
+            speaker_id = paddle.to_tensor(speaker_id)
+        else:
+            speaker_id = None
+
+        durations = paddle.to_tensor(np.array(durations))
+        # the generated mel and the real one may differ by 1 or 2 frames,
+        # but batch_fn will fix that
+        # split data into 3 sections
+
+        wav_path = utt_id + ".wav"
+
+        if wav_path in train_wav_files:
+            sub_output_dir = output_dir / ("train/raw")
+        elif wav_path in dev_wav_files:
+            sub_output_dir = output_dir / ("dev/raw")
+        elif wav_path in test_wav_files:
+            sub_output_dir = output_dir / ("test/raw")
+
+        sub_output_dir.mkdir(parents=True, exist_ok=True)
+
+        with paddle.no_grad():
+            mel = speedyspeech_inference(phone_ids, tone_ids, spk_id=speaker_id)
+        np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)
+
+
+def main():
+    # parse args and config and redirect to train_sp
+    parser = argparse.ArgumentParser(
+        description="Generate ground-truth aligned mels with a trained speedyspeech model.")
+    parser.add_argument(
+        "--dataset",
+        default="baker",
+        type=str,
+        help="name of dataset, should be in {baker, aishell3} now")
+    parser.add_argument(
+        "--rootdir", default=None, type=str, help="directory to dataset.")
+    parser.add_argument(
+        "--speedyspeech-config", type=str, help="speedyspeech config file.")
+    parser.add_argument(
+        "--speedyspeech-checkpoint",
+        type=str,
+        help="speedyspeech checkpoint to load.")
+    parser.add_argument(
+        "--speedyspeech-stat",
+        type=str,
+        help="mean and standard deviation used to normalize spectrogram when training speedyspeech."
+    )
+
+    parser.add_argument(
+        "--phones-dict",
+        type=str,
+        default="phone_id_map.txt",
+        help="phone vocabulary file.")
+    parser.add_argument(
+        "--tones-dict",
+        type=str,
+        default="tone_id_map.txt",
+        help="tone vocabulary file.")
+    parser.add_argument(
+        "--speaker-dict",
+        type=str,
+        default=None,
+        help="speaker id map file.")
+
+    parser.add_argument(
+        "--dur-file", default=None, type=str, help="path to durations.txt.")
+    parser.add_argument("--output-dir", type=str, help="output dir.")
+    parser.add_argument(
+        "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+
+    def str2bool(str):
+        return True if str.lower() == 'true' else False
+
+    parser.add_argument(
+        "--cut-sil",
+        type=str2bool,
+        default=True,
+        help="whether cut sil in the edge of audio")
+
+    args = parser.parse_args()
+
+    if args.ngpu == 0:
+        paddle.set_device("cpu")
+    elif args.ngpu > 0:
+        paddle.set_device("gpu")
+    else:
+        print("ngpu should >= 0 !")
+
+    with open(args.speedyspeech_config) as f:
+        speedyspeech_config = CfgNode(yaml.safe_load(f))
+
+    print("========Args========")
+    print(yaml.safe_dump(vars(args)))
+    print("========Config========")
+    print(speedyspeech_config)
+
+    evaluate(args, speedyspeech_config)
+
+
+if __name__ == "__main__":
+    main()

From 61b68ed3ef0c5b7d82509acac5d635359561da63 Mon Sep 17 00:00:00 2001
From: Jerryuhoo
Date: Tue, 11 Jan 2022 14:55:28 +0800
Subject: [PATCH 10/60] deal with exceptions of link_wav.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Skip symlinks that already exist; if the file cannot be found in dump,
delete the corresponding file in dump_finetune.

---
 utils/link_wav.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/utils/link_wav.py b/utils/link_wav.py
index 5b24c87d3..f1b513989 100644
--- a/utils/link_wav.py
+++ b/utils/link_wav.py
@@ -58,9
From 61b68ed3ef0c5b7d82509acac5d635359561da63 Mon Sep 17 00:00:00 2001
From: Jerryuhoo
Date: Tue, 11 Jan 2022 14:55:28 +0800
Subject: [PATCH 10/60] deal with exceptions of link_wav.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Skip a symlink if it already exists; if the file cannot be found in dump, delete the corresponding file in dump_finetune.

---
 utils/link_wav.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/utils/link_wav.py b/utils/link_wav.py
index 5b24c87d3..f1b513989 100644
--- a/utils/link_wav.py
+++ b/utils/link_wav.py
@@ -58,9 +58,15 @@ def main():
             mel_path = output_dir / ("raw/" + name)
             gen_mel = np.load(mel_path)
             wave_name = utt_id + "_wave.npy"
-            wav = np.load(old_dump_dir / sub / ("raw/" + wave_name))
-            os.symlink(old_dump_dir / sub / ("raw/" + wave_name),
-                       output_dir / ("raw/" + wave_name))
+            try:
+                wav = np.load(old_dump_dir / sub / ("raw/" + wave_name))
+                os.symlink(old_dump_dir / sub / ("raw/" + wave_name),
+                           output_dir / ("raw/" + wave_name))
+            except FileNotFoundError:
+                print("delete " + name + " because it cannot be found in the dump folder")
+                os.remove(output_dir / "raw" / name)
+            except FileExistsError:
+                print("file " + name + " exists, skip.")
             num_sample = wav.shape[0]
             num_frames = gen_mel.shape[0]
             wav_path = output_dir / ("raw/" + wave_name)
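The policy is easy to exercise in isolation. A self-contained sketch of the same skip/delete behavior, using a temp directory instead of the real dump layout and `read_bytes` standing in for `np.load`:

```python
import os
import tempfile
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    src = Path(tmp) / "dump" / "raw"           # original dump
    dst = Path(tmp) / "dump_finetune" / "raw"  # finetune dump
    src.mkdir(parents=True)
    dst.mkdir(parents=True)
    (dst / "utt1_feats.npy").touch()  # a generated mel whose wav is missing

    try:
        wav = (src / "utt1_wave.npy").read_bytes()  # np.load in the patch
        os.symlink(src / "utt1_wave.npy", dst / "utt1_wave.npy")
    except FileNotFoundError:
        # the wav never made it into dump: drop the orphaned mel
        print("delete utt1_feats.npy because it cannot be found in the dump folder")
        os.remove(dst / "utt1_feats.npy")
    except FileExistsError:
        # the link survives from a previous run: nothing to do
        print("file utt1_feats.npy exists, skip.")
```

One caveat the hunk leaves open: after either exception, execution still falls through to `num_sample = wav.shape[0]`, where `wav` may be unbound; a later patch in this series adds the missing `continue` statements.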
From 52a8b2f3209b9bd5e6809f9d38348962e2627c75 Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Tue, 11 Jan 2022 15:04:23 +0800
Subject: [PATCH 11/60] Add ECAPA_TDNN. (#1301)

---
 paddlespeech/vector/models/ecapa_tdnn.py | 44 ++++++++++--------------
 1 file changed, 18 insertions(+), 26 deletions(-)

diff --git a/paddlespeech/vector/models/ecapa_tdnn.py b/paddlespeech/vector/models/ecapa_tdnn.py
index 5512f5097..e493b8004 100644
--- a/paddlespeech/vector/models/ecapa_tdnn.py
+++ b/paddlespeech/vector/models/ecapa_tdnn.py
@@ -47,7 +47,7 @@ class Conv1d(nn.Layer):
             groups=1,
             bias=True,
             padding_mode="reflect", ):
-        super(Conv1d, self).__init__()
+        super().__init__()
 
         self.kernel_size = kernel_size
         self.stride = stride
@@ -110,7 +110,7 @@ class BatchNorm1d(nn.Layer):
             bias_attr=None,
             data_format='NCL',
             use_global_stats=None, ):
-        super(BatchNorm1d, self).__init__()
+        super().__init__()
 
         self.norm = nn.BatchNorm1D(
             input_size,
@@ -134,7 +134,7 @@ class TDNNBlock(nn.Layer):
             kernel_size,
             dilation,
             activation=nn.ReLU, ):
-        super(TDNNBlock, self).__init__()
+        super().__init__()
         self.conv = Conv1d(
             in_channels=in_channels,
             out_channels=out_channels,
@@ -149,7 +149,7 @@ class TDNNBlock(nn.Layer):
 
 class Res2NetBlock(nn.Layer):
     def __init__(self, in_channels, out_channels, scale=8, dilation=1):
-        super(Res2NetBlock, self).__init__()
+        super().__init__()
         assert in_channels % scale == 0
         assert out_channels % scale == 0
 
@@ -179,7 +179,7 @@ class Res2NetBlock(nn.Layer):
 
 class SEBlock(nn.Layer):
     def __init__(self, in_channels, se_channels, out_channels):
-        super(SEBlock, self).__init__()
+        super().__init__()
 
         self.conv1 = Conv1d(
             in_channels=in_channels, out_channels=se_channels, kernel_size=1)
@@ -275,7 +275,7 @@ class SERes2NetBlock(nn.Layer):
             kernel_size=1,
             dilation=1,
             activation=nn.ReLU, ):
-        super(SERes2NetBlock, self).__init__()
+        super().__init__()
         self.out_channels = out_channels
         self.tdnn1 = TDNNBlock(
             in_channels,
@@ -313,7 +313,7 @@ class SERes2NetBlock(nn.Layer):
         return x + residual
 
 
-class ECAPA_TDNN(nn.Layer):
+class EcapaTdnn(nn.Layer):
     def __init__(
             self,
             input_size,
@@ -327,7 +327,7 @@ class ECAPA_TDNN(nn.Layer):
             se_channels=128,
             global_context=True, ):
-        super(ECAPA_TDNN, self).__init__()
+        super().__init__()
         assert len(channels) == len(kernel_sizes)
         assert len(channels) == len(dilations)
         self.channels = channels
@@ -377,6 +377,16 @@ class ECAPA_TDNN(nn.Layer):
             kernel_size=1, )
 
     def forward(self, x, lengths=None):
+        """
+        Compute embeddings.
+
+        Args:
+            x (paddle.Tensor): Input log-fbanks with shape (N, n_mels, T).
+            lengths (paddle.Tensor, optional): Length proportions relative to the padded batch length, with shape (N). Defaults to None.
+
+        Returns:
+            paddle.Tensor: Output embeddings with shape (N, self.emb_size, 1)
+        """
         xl = []
         for layer in self.blocks:
             try:
@@ -397,21 +407,3 @@ class ECAPA_TDNN(nn.Layer):
         x = self.fc(x)
 
         return x
-
-
-class Classifier(nn.Layer):
-    def __init__(self, backbone, num_class, dtype=paddle.float32):
-        super(Classifier, self).__init__()
-        self.backbone = backbone
-        self.params = nn.ParameterList([
-            paddle.create_parameter(
-                shape=[num_class, self.backbone.emb_size], dtype=dtype)
-        ])
-
-    def forward(self, x):
-        emb = self.backbone(x.transpose([0, 2, 1])).transpose([0, 2, 1])
-        logits = F.linear(
-            F.normalize(emb.squeeze(1)),
-            F.normalize(self.params[0]).transpose([1, 0]))
-
-        return logits

From 9c1efde2e3220ad40c299aa588bad00a8e71aba4 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Tue, 11 Jan 2022 15:27:50 +0800
Subject: [PATCH 12/60] more mergify label (#1308)

---
 .mergify.yml | 34 +++++++++++++++++++++++++++++-----
 1 file changed, 29 insertions(+), 5 deletions(-)

diff --git a/.mergify.yml b/.mergify.yml
index 2c30721f7..3347c6dc3 100644
--- a/.mergify.yml
+++ b/.mergify.yml
@@ -32,6 +32,12 @@ pull_request_rules:
     actions:
       label:
         remove: ["conflicts"]
+  - name: "auto add label=Dataset"
+    conditions:
+      - files~=^dataset/
+    actions:
+      label:
+        add: ["Dataset"]
   - name: "auto add label=S2T"
     conditions:
       - files~=^paddlespeech/s2t/
@@ -50,18 +56,30 @@ pull_request_rules:
     actions:
       label:
         add: ["Audio"]
+  - name: "auto add label=Vector"
+    conditions:
+      - files~=^paddlespeech/vector/
+    actions:
+      label:
+        add: ["Vector"]
-  - name: "auto add label=TextProcess"
+  - name: "auto add label=Text"
     conditions:
       - files~=^paddlespeech/text/
     actions:
       label:
-        add: ["TextProcess"]
+        add: ["Text"]
   - name: "auto add label=Example"
     conditions:
       - files~=^examples/
     actions:
       label:
         add: ["Example"]
+  - name: "auto add label=CLI"
+    conditions:
+      - files~=^paddlespeech/cli
+    actions:
+      label:
+        add: ["CLI"]
   - name: "auto add label=Demo"
     conditions:
       - files~=^demos/
@@ -70,13 +88,13 @@ pull_request_rules:
         add: ["Demo"]
   - name: "auto add label=README"
     conditions:
-    - files~=README.md
+    - files~=(README.md|README_cn.md)
     actions:
       label:
         add: ["README"]
   - name: "auto add label=Documentation"
     conditions:
-    - files~=^docs/
+    - files~=^(docs/|CHANGELOG.md|paddleaudio/CHANGELOG.md)
     actions:
       label:
         add: ["Documentation"]
@@ -88,10 +106,16 @@ pull_request_rules:
         add: ["CI"]
   - name: "auto add label=Installation"
     conditions:
-    - files~=^(tools/|setup.py|setup.sh)
+    - files~=^(tools/|setup.py|setup.cfg|setup_audio.py)
     actions:
       label:
         add: ["Installation"]
+  - name: "auto add label=Test"
+    conditions:
+      - files~=^(tests/)
+    actions:
+      label:
+        add: ["Test"]
   - name: "auto add label=mergify"
     conditions:
     - files~=^.mergify.yml

From be99807d6172c65298e48610d2e15a859d71cb57 Mon Sep 17 00:00:00 2001
From: Jerryuhoo
Date: Tue, 11 Jan 2022 16:32:33 +0800
Subject: [PATCH 13/60] Add durations to gen_gta_mel.py inference

---
 .../t2s/exps/speedyspeech/gen_gta_mel.py      |  6 ++-
 .../t2s/models/speedyspeech/speedyspeech.py   | 52 +++++++++++--------
 2 files changed, 34 insertions(+), 24 deletions(-)

diff --git a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py
index ddd961a95..0c2bb02d6 100644
--- a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py
+++ b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py
@@ -73,7 +73,7 @@ def evaluate(args, speedyspeech_config):
     speedyspeech_normalizer = ZScore(mu, std)
 
     speedyspeech_inference = SpeedySpeechInference(speedyspeech_normalizer,
-                                                   model)
+                                                  model)
     speedyspeech_inference.eval()
 
     output_dir = Path(args.output_dir)
@@ -138,6 +138,8 @@ def evaluate(args, speedyspeech_config):
             speaker_id = None
 
         durations = paddle.to_tensor(np.array(durations))
+        durations = paddle.unsqueeze(durations, axis=0)
+
         # the generated mel and the real one may differ by 1 or 2 frames, but batch_fn will fix it
         # split data into 3 sections
 
@@ -153,7 +155,7 @@ def evaluate(args, speedyspeech_config):
         sub_output_dir.mkdir(parents=True, exist_ok=True)
 
         with paddle.no_grad():
-            mel = speedyspeech_inference(phone_ids, tone_ids, spk_id=speaker_id)
+            mel = speedyspeech_inference(phone_ids, tone_ids, durations=durations, spk_id=speaker_id)
         np.save(sub_output_dir / (utt_id + "_feats.npy"), mel)
 
 
diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
index 107c5f1cc..263b4c6b9 100644
--- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
+++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
@@ -222,7 +222,7 @@ class SpeedySpeech(nn.Layer):
         decoded = self.decoder(encodings)
         return decoded, pred_durations
 
-    def inference(self, text, tones=None, spk_id=None):
+    def inference(self, text, tones=None, durations=None, spk_id=None):
         # text: [T]
         # tones: [T]
         # input of embedding must be int64
@@ -234,24 +234,28 @@ class SpeedySpeech(nn.Layer):
 
         encodings = self.encoder(text, tones, spk_id)
 
-        pred_durations = self.duration_predictor(encodings)  # (1, T)
-        durations_to_expand = paddle.round(pred_durations.exp())
-        durations_to_expand = (durations_to_expand).astype(paddle.int64)
-
-        slens = paddle.sum(durations_to_expand, -1)  # [1]
-        t_dec = slens[0]  # [1]
-        t_enc = paddle.shape(pred_durations)[-1]
-        M = paddle.zeros([1, t_dec, t_enc])
-
-        k = paddle.full([1], 0, dtype=paddle.int64)
-        for j in range(t_enc):
-            d = durations_to_expand[0, j]
-            # If the d == 0, slice action is meaningless and not supported
-            if d >= 1:
-                M[0, k:k + d, j] = 1
-                k += d
-
-        encodings = paddle.matmul(M, encodings)
+        if type(durations) == type(None):
+            pred_durations = self.duration_predictor(encodings)  # (1, T)
+            durations_to_expand = paddle.round(pred_durations.exp())
+            durations_to_expand = (durations_to_expand).astype(paddle.int64)
+
+            slens = paddle.sum(durations_to_expand, -1)  # [1]
+            t_dec = slens[0]  # [1]
+            t_enc = paddle.shape(pred_durations)[-1]
+            M = paddle.zeros([1, t_dec, t_enc])
+
+            k = paddle.full([1], 0, dtype=paddle.int64)
+            for j in range(t_enc):
+                d = durations_to_expand[0, j]
+                # If the d == 0, slice action is meaningless and not supported
+                if d >= 1:
+                    M[0, k:k + d, j] = 1
+                    k += d
+
+            encodings = paddle.matmul(M, encodings)
+        else:
+            durations_to_expand = durations
+            encodings = expand(encodings, durations_to_expand)
 
         shape = paddle.shape(encodings)
         t_dec, feature_size = shape[1], shape[2]
@@ -266,7 +270,11 @@ class SpeedySpeechInference(nn.Layer):
         self.normalizer = normalizer
         self.acoustic_model = speedyspeech_model
 
-    def forward(self, phones, tones, spk_id=None):
-        normalized_mel = self.acoustic_model.inference(phones, tones, spk_id)
+    def forward(self, phones, tones, durations=None, spk_id=None):
+        normalized_mel = self.acoustic_model.inference(
+            phones,
+            tones,
+            durations=durations,
+            spk_id=spk_id)
         logmel = self.normalizer.inverse(normalized_mel)
-        return logmel
+        return logmel
\ No newline at end of file
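In effect, `SpeedySpeechInference.forward` can now be driven by ground-truth durations instead of the duration predictor, which is exactly what GTA mel generation needs. A minimal stand-alone sketch of the dispatch pattern (with a dummy predictor, not the library code):

```python
import numpy as np

def predict_durations(n_phones):
    # Stand-in for duration_predictor plus the exp/round post-processing.
    return np.full(n_phones, 2, dtype=np.int64)

def choose_durations(durations, n_phones):
    # Mirrors the new branch: ground-truth durations win when provided,
    # otherwise the predictor runs as before. (`durations is None` is the
    # idiomatic test that a later patch substitutes for the type() check.)
    if durations is None:
        return predict_durations(n_phones)
    return durations

print(choose_durations(None, 3))                                 # [2 2 2]
print(choose_durations(np.array([1, 4, 2], dtype=np.int64), 3))  # [1 4 2]
```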
From 27bb76bdb92330ccbed1d70f47ac38dae2a3213e Mon Sep 17 00:00:00 2001
From: TianYuan
Date: Tue, 11 Jan 2022 08:53:25 +0000
Subject: [PATCH 14/60] fix tone_sandhi of yi, test=tts

---
 paddlespeech/t2s/frontend/tone_sandhi.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/paddlespeech/t2s/frontend/tone_sandhi.py b/paddlespeech/t2s/frontend/tone_sandhi.py
index 6ba567bb9..5264e0687 100644
--- a/paddlespeech/t2s/frontend/tone_sandhi.py
+++ b/paddlespeech/t2s/frontend/tone_sandhi.py
@@ -65,6 +65,7 @@ class ToneSandhi():
         self.must_not_neural_tone_words = {
             "男子", "女子", "分子", "原子", "量子", "莲子", "石子", "瓜子", "电子"
         }
+        self.punc = ":,;。?!“”‘’':,;.?!"
 
     # the meaning of jieba pos tag: https://blog.csdn.net/weixin_44174352/article/details/113731041
     # e.g.
@@ -147,7 +148,9 @@ class ToneSandhi():
                 finals[i] = finals[i][:-1] + "2"
             # "一" before non-tone4 should be yi4, e.g. 一天
             else:
-                finals[i] = finals[i][:-1] + "4"
+                # "一" followed by punctuation is still read as tone 1
+                if word[i + 1] not in self.punc:
+                    finals[i] = finals[i][:-1] + "4"
         return finals
 
     def _split_word(self, word: str) -> List[str]:
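The rule this patch encodes: "一" becomes yi2 before a tone-4 syllable, yi4 before other tones, and keeps its original yi1 when the next character is punctuation. A simplified re-implementation for illustration only (the real `ToneSandhi` class handles many more cases, and punctuation here gets a placeholder final to keep the lists aligned):

```python
PUNC = ":,;。?!“”‘’':,;.?!"  # the punctuation set the patch stores as self.punc

def yi_sandhi(word, finals):
    # Toy version of the rule: adjust the tone digit at the end of the
    # final for every "一" based on what follows it.
    for i, char in enumerate(word):
        if char == "一" and i + 1 < len(word):
            if finals[i + 1][-1] == "4":
                finals[i] = finals[i][:-1] + "2"
            elif word[i + 1] not in PUNC:
                finals[i] = finals[i][:-1] + "4"
    return finals

print(yi_sandhi("一天", ["i1", "ian1"]))   # ['i4', 'ian1']  (yi4 before tone 1)
print(yi_sandhi("一样", ["i1", "iang4"]))  # ['i2', 'iang4'] (yi2 before tone 4)
print(yi_sandhi("一,", ["i1", ","]))      # ['i1', ','] (unchanged before punctuation)
```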
From c893d0d8287f674a6fd32a83662b6cb529cbee22 Mon Sep 17 00:00:00 2001
From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com>
Date: Tue, 11 Jan 2022 19:35:27 +0800
Subject: [PATCH 15/60] update the required version of yacs, test=doc_fix
 (#1311)

---
 requirements.txt | 2 +-
 setup.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index e567dfa70..c68893183 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -43,5 +43,5 @@ typeguard
 unidecode
 visualdl
 webrtcvad
-yacs
+yacs~=0.1.8
 yq
diff --git a/setup.py b/setup.py
index 75b3fe5c8..5d4ff80f7 100644
--- a/setup.py
+++ b/setup.py
@@ -61,7 +61,7 @@ requirements = {
         "typeguard",
         "visualdl",
         "webrtcvad",
-        "yacs",
+        "yacs~=0.1.8",
     ],
     "develop": [
         "ConfigArgParse",

From fe1dc9d2111695d9159be9e89a5239e09f084aaf Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Tue, 11 Jan 2022 11:42:14 +0000
Subject: [PATCH 16/60] refactor the cli/st, test=st

---
 paddlespeech/cli/st/infer.py | 35 +++++++++++++++--------------------
 utils/generate_infer_yaml.py | 33 +++++++++++++++++++--------------
 2 files changed, 34 insertions(+), 34 deletions(-)

diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py
index d6bd6304d..1276424c5 100644
--- a/paddlespeech/cli/st/infer.py
+++ b/paddlespeech/cli/st/infer.py
@@ -40,11 +40,11 @@ __all__ = ["STExecutor"]
 pretrained_models = {
     "fat_st_ted-en-zh": {
         "url":
-        "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_ted-en-zh.tar.gz",
+        "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz",
         "md5":
-        "fa0a7425b91b4f8d259c70b2aca5ae67",
+        "d62063f35a16d91210a71081bd2dd557",
         "cfg_path":
-        "conf/transformer_mtl_noam.yaml",
+        "model.yaml",
         "ckpt_path":
         "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams",
     }
@@ -170,24 +170,19 @@ class STExecutor(BaseExecutor):
         #Init body.
         self.config = CfgNode(new_allowed=True)
         self.config.merge_from_file(self.cfg_path)
-        self.config.decoding.decoding_method = "fullsentence"
+        self.config.decode.decoding_method = "fullsentence"
 
         with UpdateConfig(self.config):
-            self.config.collator.vocab_filepath = os.path.join(
-                res_path, self.config.collator.vocab_filepath)
-            self.config.collator.cmvn_path = os.path.join(
-                res_path, self.config.collator.cmvn_path)
-            self.config.collator.spm_model_prefix = os.path.join(
-                res_path, self.config.collator.spm_model_prefix)
+            self.config.cmvn_path = os.path.join(
+                res_path, self.config.cmvn_path)
+            self.config.spm_model_prefix = os.path.join(
+                res_path, self.config.spm_model_prefix)
             self.text_feature = TextFeaturizer(
-                unit_type=self.config.collator.unit_type,
-                vocab=self.config.collator.vocab_filepath,
-                spm_model_prefix=self.config.collator.spm_model_prefix)
-            self.config.model.input_dim = self.config.collator.feat_dim
-            self.config.model.output_dim = self.text_feature.vocab_size
-
-        model_conf = self.config.model
-        logger.info(model_conf)
+                unit_type=self.config.unit_type,
+                vocab=self.config.vocab_filepath,
+                spm_model_prefix=self.config.spm_model_prefix)
+
+        model_conf = self.config
         model_name = model_type[:model_type.rindex(
             '_')]  # model_type: {model_name}_{dataset}
         model_class = dynamic_import(model_name, model_alias)
@@ -218,7 +213,7 @@ class STExecutor(BaseExecutor):
         logger.info("Preprocess audio_file:" + audio_file)
 
         if "fat_st" in model_type:
-            cmvn = self.config.collator.cmvn_path
+            cmvn = self.config.cmvn_path
             utt_name = "_tmp"
 
             # Get the object for feature extraction
@@ -284,7 +279,7 @@ class STExecutor(BaseExecutor):
         """
             Model inference and result stored in self.output.
         """
-        cfg = self.config.decoding
+        cfg = self.config.decode
         audio = self._inputs["audio"]
         audio_len = self._inputs["audio_len"]
         if model_type == "fat_st_ted":
diff --git a/utils/generate_infer_yaml.py b/utils/generate_infer_yaml.py
index d2a6777c7..a2eb28c76 100644
--- a/utils/generate_infer_yaml.py
+++ b/utils/generate_infer_yaml.py
@@ -67,21 +67,26 @@ def merge_configs(
     config = load(conf_path)
     decode_config = load(decode_path)
     vocab_list = load_dict(vocab_path)
-    cmvn_stats = load_json(cmvn_path)
 
-    if os.path.exists(preprocess_path):
-        preprocess_config = load(preprocess_path)
-        for idx, process in enumerate(preprocess_config["process"]):
-            if process['type'] == "cmvn_json":
-                preprocess_config["process"][idx][
-                    "cmvn_path"] = cmvn_stats
-                break
-
-        config.preprocess_config = preprocess_config
-    else:
-        cmvn_stats = load_cmvn_from_json(cmvn_stats)
-        config.mean_std_filepath = [{"cmvn_stats":cmvn_stats}]
-        config.augmentation_config = ''
+    # If the kaldi features are used, do not load the cmvn file
+    if cmvn_path.split(".")[-1] == 'json':
+        cmvn_stats = load_json(cmvn_path)
+        if os.path.exists(preprocess_path):
+            preprocess_config = load(preprocess_path)
+            for idx, process in enumerate(preprocess_config["process"]):
+                if process['type'] == "cmvn_json":
+                    preprocess_config["process"][idx][
+                        "cmvn_path"] = cmvn_stats
+                    break
+
+            config.preprocess_config = preprocess_config
+        else:
+            cmvn_stats = load_cmvn_from_json(cmvn_stats)
+            config.mean_std_filepath = [{"cmvn_stats":cmvn_stats}]
+            config.augmentation_config = ''
+    # the cmvn file ends with .ark
+    else:
+        config.cmvn_path = cmvn_path
 
     # Update the config
     config.vocab_filepath = vocab_list
     config.input_dim = config.feat_dim
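The refactor above assumes the exported `model.yaml` carries a flattened config: fields such as `cmvn_path`, `unit_type`, and the `decode` block now sit at the top level instead of under `collator`/`model`/`decoding`. A small sketch of the shape the executor now reads, using `yacs` (which this series pins to ~0.1.8); the key names below are illustrative, taken from the accesses in the diff rather than a real exported file:

```python
import yaml
from yacs.config import CfgNode

# Hypothetical flattened model.yaml for an ST model.
FLAT_YAML = """
unit_type: spm
vocab_filepath: vocab.txt
cmvn_path: data/mean_std.json
spm_model_prefix: bpe_unigram_8000
decode:
  decoding_method: fullsentence
"""

config = CfgNode(yaml.safe_load(FLAT_YAML))
print(config.decode.decoding_method)  # fullsentence
print(config.cmvn_path)               # no more config.collator.cmvn_path
```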
From e7189b216c2f0443fffaea8efa6f9804c8de7997 Mon Sep 17 00:00:00 2001
From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com>
Date: Tue, 11 Jan 2022 20:47:02 +0800
Subject: [PATCH 17/60] Update chunk_decode.yaml

---
 examples/aishell/asr1/conf/tuning/chunk_decode.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/aishell/asr1/conf/tuning/chunk_decode.yaml b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml
index 72ede9272..7e8afb7a8 100644
--- a/examples/aishell/asr1/conf/tuning/chunk_decode.yaml
+++ b/examples/aishell/asr1/conf/tuning/chunk_decode.yaml
@@ -3,9 +3,9 @@ decode_batch_size: 128
 error_rate_type: cer
 decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
 ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
-decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
+decoding_chunk_size: 16 # decoding chunk size. Defaults to -1.
      # <0: for decoding, use full chunk.
      # >0: for decoding, use fixed chunk size as set.
      # 0: used for training, it's prohibited here.
 num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
-simulate_streaming: False # simulate streaming inference. Defaults to False.
+simulate_streaming: True # simulate streaming inference. Defaults to False.

From 1e710ef57000d71a3f1981913a9b6f07690b4bbf Mon Sep 17 00:00:00 2001
From: Jerryuhoo
Date: Wed, 12 Jan 2022 09:29:13 +0800
Subject: [PATCH 18/60] Update link_wav.py, test=tts

---
 utils/link_wav.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/utils/link_wav.py b/utils/link_wav.py
index f1b513989..a9ad4136c 100644
--- a/utils/link_wav.py
+++ b/utils/link_wav.py
@@ -65,8 +65,10 @@ def main():
             except FileNotFoundError:
                 print("delete " + name + " because it cannot be found in the dump folder")
                 os.remove(output_dir / "raw" / name)
+                continue
             except FileExistsError:
-                print("file " + name + " exists, skip.")
+                print("file " + name + " exists, skip.")
+                continue
             num_sample = wav.shape[0]
             num_frames = gen_mel.shape[0]
             wav_path = output_dir / ("raw/" + wave_name)

From 111a45237889ace23764d1399b64a619a45047ce Mon Sep 17 00:00:00 2001
From: Jerryuhoo
Date: Wed, 12 Jan 2022 13:58:33 +0800
Subject: [PATCH 19/60] Fix the code format, test=tts

---
 .../t2s/exps/speedyspeech/gen_gta_mel.py      | 43 +++++++++++--------
 .../t2s/models/speedyspeech/speedyspeech.py   |  7 +--
 utils/link_wav.py                             |  6 ++-
 3 files changed, 32 insertions(+), 24 deletions(-)

diff --git a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py
index 0c2bb02d6..b6440fd6f 100644
--- a/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py
+++ b/paddlespeech/t2s/exps/speedyspeech/gen_gta_mel.py
@@ -15,21 +15,22 @@
 # for mb melgan finetune
 # what to do if the length is inconsistent with the original mel?
import argparse +import os from pathlib import Path import numpy as np import paddle import yaml -from yacs.config import CfgNode from tqdm import tqdm -import os +from yacs.config import CfgNode from paddlespeech.t2s.datasets.preprocess_utils import get_phn_dur from paddlespeech.t2s.datasets.preprocess_utils import merge_silence +from paddlespeech.t2s.frontend.zh_frontend import Frontend from paddlespeech.t2s.models.speedyspeech import SpeedySpeech from paddlespeech.t2s.models.speedyspeech import SpeedySpeechInference from paddlespeech.t2s.modules.normalizer import ZScore -from paddlespeech.t2s.frontend.zh_frontend import Frontend + def evaluate(args, speedyspeech_config): rootdir = Path(args.rootdir).expanduser() @@ -50,17 +51,21 @@ def evaluate(args, speedyspeech_config): tone_size = len(tone_id) print("tone_size:", tone_size) - frontend = Frontend(phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) + frontend = Frontend( + phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) if args.speaker_dict: with open(args.speaker_dict, 'rt') as f: spk_id_list = [line.strip().split() for line in f.readlines()] spk_num = len(spk_id_list) else: - spk_num=None + spk_num = None model = SpeedySpeech( - vocab_size=vocab_size, tone_size=tone_size, **speedyspeech_config["model"], spk_num=spk_num) + vocab_size=vocab_size, + tone_size=tone_size, + **speedyspeech_config["model"], + spk_num=spk_num) model.set_state_dict( paddle.load(args.speedyspeech_checkpoint)["main_params"]) @@ -105,9 +110,15 @@ def evaluate(args, speedyspeech_config): else: train_wav_files += wav_files - train_wav_files = [os.path.basename(str(str_path)) for str_path in train_wav_files] - dev_wav_files = [os.path.basename(str(str_path)) for str_path in dev_wav_files] - test_wav_files = [os.path.basename(str(str_path)) for str_path in test_wav_files] + train_wav_files = [ + os.path.basename(str(str_path)) for str_path in train_wav_files + ] + dev_wav_files = [ + os.path.basename(str(str_path)) for str_path in dev_wav_files + ] + test_wav_files = [ + os.path.basename(str(str_path)) for str_path in test_wav_files + ] for i, utt_id in enumerate(tqdm(sentences)): phones = sentences[utt_id][0] @@ -122,8 +133,7 @@ def evaluate(args, speedyspeech_config): durations = durations[:-1] phones = phones[:-1] - phones, tones = frontend._get_phone_tone( - phones, get_tone_ids=True) + phones, tones = frontend._get_phone_tone(phones, get_tone_ids=True) if tones: tone_ids = frontend._t2id(tones) tone_ids = paddle.to_tensor(tone_ids) @@ -132,7 +142,8 @@ def evaluate(args, speedyspeech_config): phone_ids = paddle.to_tensor(phone_ids) if args.speaker_dict: - speaker_id = int([item[1] for item in spk_id_list if speaker == item[0]][0]) + speaker_id = int( + [item[1] for item in spk_id_list if speaker == item[0]][0]) speaker_id = paddle.to_tensor(speaker_id) else: speaker_id = None @@ -155,7 +166,8 @@ def evaluate(args, speedyspeech_config): sub_output_dir.mkdir(parents=True, exist_ok=True) with paddle.no_grad(): - mel = speedyspeech_inference(phone_ids, tone_ids, durations=durations, spk_id=speaker_id) + mel = speedyspeech_inference( + phone_ids, tone_ids, durations=durations, spk_id=speaker_id) np.save(sub_output_dir / (utt_id + "_feats.npy"), mel) @@ -193,10 +205,7 @@ def main(): default="tone_id_map.txt", help="tone vocabulary file.") parser.add_argument( - "--speaker-dict", - type=str, - default=None, - help="speaker id map file.") + "--speaker-dict", type=str, default=None, help="speaker id map file.") parser.add_argument( 
"--dur-file", default=None, type=str, help="path to durations.txt.") diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index 263b4c6b9..acddd976f 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -272,9 +272,6 @@ class SpeedySpeechInference(nn.Layer): def forward(self, phones, tones, durations=None, spk_id=None): normalized_mel = self.acoustic_model.inference( - phones, - tones, - durations=durations, - spk_id=spk_id) + phones, tones, durations=durations, spk_id=spk_id) logmel = self.normalizer.inverse(normalized_mel) - return logmel \ No newline at end of file + return logmel diff --git a/utils/link_wav.py b/utils/link_wav.py index a9ad4136c..8fe2156b2 100644 --- a/utils/link_wav.py +++ b/utils/link_wav.py @@ -20,6 +20,7 @@ import jsonlines import numpy as np from tqdm import tqdm + def main(): # parse config and args parser = argparse.ArgumentParser( @@ -61,9 +62,10 @@ def main(): try: wav = np.load(old_dump_dir / sub / ("raw/" + wave_name)) os.symlink(old_dump_dir / sub / ("raw/" + wave_name), - output_dir / ("raw/" + wave_name)) + output_dir / ("raw/" + wave_name)) except FileNotFoundError: - print("delete " + name + " because it cannot be found in the dump folder") + print("delete " + name + + " because it cannot be found in the dump folder") os.remove(output_dir / "raw" / name) continue except FileExistsError: From 0c6981cea197548bb145857550f45d1d0b5e206c Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 12 Jan 2022 06:46:43 +0000 Subject: [PATCH 20/60] fix install_kaldi, test=doc_fix --- tools/extras/install_kaldi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/extras/install_kaldi.sh b/tools/extras/install_kaldi.sh index b93e7ecf6..f8cd961fc 100755 --- a/tools/extras/install_kaldi.sh +++ b/tools/extras/install_kaldi.sh @@ -34,7 +34,7 @@ make -j4 pushd ../src OPENBLAS_DIR=${KALDI_DIR}/../OpenBLAS mkdir -p ${OPENBLAS_DIR}/install -if [ $SHARED == true ]; +if [ $SHARED == true ]; then ./configure --shared --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install else ./configure --static --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install From d31dbabbc9a0dab5814d91c8c1d5a52f90f4c021 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Wed, 12 Jan 2022 15:21:50 +0800 Subject: [PATCH 21/60] [Doc] Update released model for r0.1.1 (#1316) * updated ASR released model * update the doc, test=doc_fix --- docs/source/released_model.md | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 6f8a6f9c5..3a3bc9246 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -5,14 +5,13 @@ ### Speech Recognition Model Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----: -[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/aishell_ds2_online_cer8.00_release.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0) -[Ds2 Offline Aishell ASR0 
Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/ds2.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0)
-[Conformer Online Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.chunk.release.tar.gz) | Aishell Dataset | Char-based | 283 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0594 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1)
-[Conformer Offline Aishell ASR1 Model](https://deepspeech.bj.bcebos.com/release2.1/aishell/s1/aishell.release.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0547 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1)
-[Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/transformer.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1)
-[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/conformer.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../example/librispeech/asr1)
-[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../example/librispeech/asr1)
-[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/transformer.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../example/librispeech/asr2)
+[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [Ds2 Online Aishell ASR0](../../examples/aishell/asr0)
+[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0)
+[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.056 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1)
+[Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1)
+[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../examples/librispeech/asr1)
+[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../examples/librispeech/asr1)
+[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/asr2_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../examples/librispeech/asr2)
 
 ### Language Model based on NGram
 Language Model | Training Data | Token-based | Size | Descriptions
@@ -25,7 +24,7 @@ Language Model | Training Data | Token-based | Size | Descriptions
 
 | Model | Training Data | Token-based | Size | Descriptions | BLEU | Example Link |
 | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: |
-| [Transformer FAT-ST MTL En-Zh](https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/fat_st_ted-en-zh.tar.gz) | Ted-En-Zh| Spm| | Encoder:Transformer, Decoder:Transformer,
Decoding method: Attention | 20.80 | [Transformer Ted-En-Zh ST1](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/ted_en_zh/st1) | +| [Transformer FAT-ST MTL En-Zh](https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz) | Ted-En-Zh| Spm| | Encoder:Transformer, Decoder:Transformer,
Decoding method: Attention | 20.80 | [Transformer Ted-En-Zh ST1](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/ted_en_zh/st1) | ## Text-to-Speech Models From 4935b80cb46e4c43d6f084c2e472eea879a4fc94 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Wed, 12 Jan 2022 15:22:21 +0800 Subject: [PATCH 22/60] fix install_kaldi, test=doc_fix (#1319) --- tools/extras/install_kaldi.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/extras/install_kaldi.sh b/tools/extras/install_kaldi.sh index b93e7ecf6..f8cd961fc 100755 --- a/tools/extras/install_kaldi.sh +++ b/tools/extras/install_kaldi.sh @@ -34,7 +34,7 @@ make -j4 pushd ../src OPENBLAS_DIR=${KALDI_DIR}/../OpenBLAS mkdir -p ${OPENBLAS_DIR}/install -if [ $SHARED == true ]; +if [ $SHARED == true ]; then ./configure --shared --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install else ./configure --static --use-cuda=no --static-math --mathlib=OPENBLAS --openblas-root=${OPENBLAS_DIR}/install From 0c4895cd0b16459aa97b0fd2a3ea9bf00f37327c Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Wed, 12 Jan 2022 15:24:25 +0800 Subject: [PATCH 23/60] mv the ctcdecoders to third_part (#1313) --- setup.py | 2 +- .../ctcdecoder/swig => third_party/ctc_decoders}/.gitignore | 0 .../ctcdecoder/swig => third_party/ctc_decoders}/__init__.py | 0 .../ctc_decoders}/ctc_beam_search_decoder.cpp | 0 .../swig => third_party/ctc_decoders}/ctc_beam_search_decoder.h | 0 .../swig => third_party/ctc_decoders}/ctc_greedy_decoder.cpp | 0 .../swig => third_party/ctc_decoders}/ctc_greedy_decoder.h | 0 .../swig => third_party/ctc_decoders}/decoder_utils.cpp | 0 .../swig => third_party/ctc_decoders}/decoder_utils.h | 0 .../ctcdecoder/swig => third_party/ctc_decoders}/decoders.i | 0 .../ctcdecoder/swig => third_party/ctc_decoders}/path_trie.cpp | 0 .../ctcdecoder/swig => third_party/ctc_decoders}/path_trie.h | 0 .../ctcdecoder/swig => third_party/ctc_decoders}/scorer.cpp | 0 .../ctcdecoder/swig => third_party/ctc_decoders}/scorer.h | 0 .../ctcdecoder/swig => third_party/ctc_decoders}/setup.py | 0 .../ctcdecoder/swig => third_party/ctc_decoders}/setup.sh | 0 16 files changed, 1 insertion(+), 1 deletion(-) rename {paddlespeech/s2t/decoders/ctcdecoder/swig => third_party/ctc_decoders}/.gitignore (100%) rename {paddlespeech/s2t/decoders/ctcdecoder/swig => third_party/ctc_decoders}/__init__.py (100%) rename {paddlespeech/s2t/decoders/ctcdecoder/swig => third_party/ctc_decoders}/ctc_beam_search_decoder.cpp (100%) rename {paddlespeech/s2t/decoders/ctcdecoder/swig => third_party/ctc_decoders}/ctc_beam_search_decoder.h (100%) rename {paddlespeech/s2t/decoders/ctcdecoder/swig => third_party/ctc_decoders}/ctc_greedy_decoder.cpp (100%) rename {paddlespeech/s2t/decoders/ctcdecoder/swig => third_party/ctc_decoders}/ctc_greedy_decoder.h (100%) rename {paddlespeech/s2t/decoders/ctcdecoder/swig => third_party/ctc_decoders}/decoder_utils.cpp (100%) rename {paddlespeech/s2t/decoders/ctcdecoder/swig => third_party/ctc_decoders}/decoder_utils.h (100%) rename {paddlespeech/s2t/decoders/ctcdecoder/swig => third_party/ctc_decoders}/decoders.i (100%) rename {paddlespeech/s2t/decoders/ctcdecoder/swig => third_party/ctc_decoders}/path_trie.cpp (100%) rename {paddlespeech/s2t/decoders/ctcdecoder/swig => third_party/ctc_decoders}/path_trie.h (100%) rename {paddlespeech/s2t/decoders/ctcdecoder/swig => third_party/ctc_decoders}/scorer.cpp (100%) 
rename {paddlespeech/s2t/decoders/ctcdecoder/swig => third_party/ctc_decoders}/scorer.h (100%) rename {paddlespeech/s2t/decoders/ctcdecoder/swig => third_party/ctc_decoders}/setup.py (100%) rename {paddlespeech/s2t/decoders/ctcdecoder/swig => third_party/ctc_decoders}/setup.sh (100%) diff --git a/setup.py b/setup.py index 5d4ff80f7..986eecf05 100644 --- a/setup.py +++ b/setup.py @@ -127,7 +127,7 @@ def _post_install(install_lib_dir): print("tools install.") # ctcdecoder - ctcdecoder_dir = HERE / 'paddlespeech/s2t/decoders/ctcdecoder/swig' + ctcdecoder_dir = HERE / 'third_party/ctc_decoders' with pushd(ctcdecoder_dir): check_call("bash -e setup.sh") print("ctcdecoder install.") diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/.gitignore b/third_party/ctc_decoders/.gitignore similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/.gitignore rename to third_party/ctc_decoders/.gitignore diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/__init__.py b/third_party/ctc_decoders/__init__.py similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/__init__.py rename to third_party/ctc_decoders/__init__.py diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_beam_search_decoder.cpp b/third_party/ctc_decoders/ctc_beam_search_decoder.cpp similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_beam_search_decoder.cpp rename to third_party/ctc_decoders/ctc_beam_search_decoder.cpp diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_beam_search_decoder.h b/third_party/ctc_decoders/ctc_beam_search_decoder.h similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_beam_search_decoder.h rename to third_party/ctc_decoders/ctc_beam_search_decoder.h diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp b/third_party/ctc_decoders/ctc_greedy_decoder.cpp similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp rename to third_party/ctc_decoders/ctc_greedy_decoder.cpp diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_greedy_decoder.h b/third_party/ctc_decoders/ctc_greedy_decoder.h similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/ctc_greedy_decoder.h rename to third_party/ctc_decoders/ctc_greedy_decoder.h diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/decoder_utils.cpp b/third_party/ctc_decoders/decoder_utils.cpp similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/decoder_utils.cpp rename to third_party/ctc_decoders/decoder_utils.cpp diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/decoder_utils.h b/third_party/ctc_decoders/decoder_utils.h similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/decoder_utils.h rename to third_party/ctc_decoders/decoder_utils.h diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/decoders.i b/third_party/ctc_decoders/decoders.i similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/decoders.i rename to third_party/ctc_decoders/decoders.i diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/path_trie.cpp b/third_party/ctc_decoders/path_trie.cpp similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/path_trie.cpp rename to third_party/ctc_decoders/path_trie.cpp diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/path_trie.h b/third_party/ctc_decoders/path_trie.h similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/path_trie.h rename to 
third_party/ctc_decoders/path_trie.h diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/scorer.cpp b/third_party/ctc_decoders/scorer.cpp similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/scorer.cpp rename to third_party/ctc_decoders/scorer.cpp diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/scorer.h b/third_party/ctc_decoders/scorer.h similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/scorer.h rename to third_party/ctc_decoders/scorer.h diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py b/third_party/ctc_decoders/setup.py similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/setup.py rename to third_party/ctc_decoders/setup.py diff --git a/paddlespeech/s2t/decoders/ctcdecoder/swig/setup.sh b/third_party/ctc_decoders/setup.sh similarity index 100% rename from paddlespeech/s2t/decoders/ctcdecoder/swig/setup.sh rename to third_party/ctc_decoders/setup.sh From caa391f4614c3766f5476ab11222ce9ee57d3591 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 12 Jan 2022 15:26:49 +0800 Subject: [PATCH 24/60] fix speedyspeech inference, test=tts (#1322) --- .../t2s/models/speedyspeech/speedyspeech.py | 35 +++++-------------- 1 file changed, 9 insertions(+), 26 deletions(-) diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index acddd976f..3e64e670c 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np import paddle from paddle import nn @@ -23,18 +22,16 @@ def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor: encodings: (B, T, C) durations: (B, T) """ - batch_size, t_enc = durations.shape - durations = durations.numpy() - slens = np.sum(durations, -1) - t_dec = np.max(slens) - M = np.zeros([batch_size, t_dec, t_enc]) + batch_size, t_enc = paddle.shape(durations) + slens = paddle.sum(durations, -1) + t_dec = paddle.max(slens) + M = paddle.zeros([batch_size, t_dec, t_enc]) for i in range(batch_size): k = 0 for j in range(t_enc): d = durations[i, j] M[i, k:k + d, j] = 1 k += d - M = paddle.to_tensor(M, dtype=encodings.dtype) encodings = paddle.matmul(M, encodings) return encodings @@ -234,28 +231,14 @@ class SpeedySpeech(nn.Layer): encodings = self.encoder(text, tones, spk_id) - if type(durations) == type(None): - pred_durations = self.duration_predictor(encodings) # (1, T) + if durations is None: + # (1, T) + pred_durations = self.duration_predictor(encodings) durations_to_expand = paddle.round(pred_durations.exp()) - durations_to_expand = (durations_to_expand).astype(paddle.int64) - - slens = paddle.sum(durations_to_expand, -1) # [1] - t_dec = slens[0] # [1] - t_enc = paddle.shape(pred_durations)[-1] - M = paddle.zeros([1, t_dec, t_enc]) - - k = paddle.full([1], 0, dtype=paddle.int64) - for j in range(t_enc): - d = durations_to_expand[0, j] - # If the d == 0, slice action is meaningless and not supported - if d >= 1: - M[0, k:k + d, j] = 1 - k += d - - encodings = paddle.matmul(M, encodings) + durations_to_expand = durations_to_expand.astype(paddle.int64) else: durations_to_expand = durations - encodings = expand(encodings, durations_to_expand) + encodings = expand(encodings, durations_to_expand) shape = paddle.shape(encodings) 
t_dec, feature_size = shape[1], shape[2] From 940390a4430394348fb9cf5dc216f7ed8b411936 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Wed, 12 Jan 2022 15:31:38 +0800 Subject: [PATCH 25/60] [CLI] Add unit test (#1321) * add test_cli.sh * set -e --- tests/unit/cli/test_cli.sh | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 tests/unit/cli/test_cli.sh diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh new file mode 100644 index 000000000..5ce57fd0d --- /dev/null +++ b/tests/unit/cli/test_cli.sh @@ -0,0 +1,24 @@ +#!/bin/bash +set -e +# Audio classification +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav +paddlespeech cls --input ./cat.wav --topk 10 + +# Punctuation_restoration +paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭 + +# Speech_recognition +wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav +paddlespeech asr --input ./zh.wav +paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav + +# Text To Speech +paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0 +paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "hello world" +paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "hello, boys" --lang en --spk_id 0 + + +# Speech Translation (only support linux) +paddlespeech st --input ./en.wav From acfe2b90849443547591ab1c23ba247638d20ddc Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 12 Jan 2022 15:35:44 +0800 Subject: [PATCH 26/60] Update duration_predictor.py --- paddlespeech/t2s/modules/predictor/duration_predictor.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/t2s/modules/predictor/duration_predictor.py b/paddlespeech/t2s/modules/predictor/duration_predictor.py index 6d7adf236..6b7c6a6be 100644 --- a/paddlespeech/t2s/modules/predictor/duration_predictor.py +++ b/paddlespeech/t2s/modules/predictor/duration_predictor.py @@ -115,8 +115,8 @@ class DurationPredictor(nn.Layer): Returns ---------- - Tensor - Batch of predicted durations in log domain (B, Tmax). + Tensor + Batch of predicted durations in log domain (B, Tmax). """ return self._forward(xs, x_masks, False) From 7ae4f7221e41eee81596ab08e4fb1bbef4abcb05 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 12 Jan 2022 15:36:43 +0800 Subject: [PATCH 27/60] Update length_regulator.py --- paddlespeech/t2s/modules/predictor/length_regulator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py index bf595b24e..f1ecfb7c1 100644 --- a/paddlespeech/t2s/modules/predictor/length_regulator.py +++ b/paddlespeech/t2s/modules/predictor/length_regulator.py @@ -70,8 +70,8 @@ class LengthRegulator(nn.Layer): ---------- xs : Tensor Batch of sequences of char or phoneme embeddings (B, Tmax, D). - ds : LongTensor - Batch of durations of each frame (B, T). + ds : Tensor(int64) + Batch of durations of each frame (B, T). alpha : float, optional Alpha value to control speed of speech. 
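The two docstring fixes above describe the same mechanism that PATCH 24 reimplements with paddle ops: duration-driven length regulation, where the vector for phone j is repeated `ds[j]` times to reach frame resolution. A numpy sketch of the semantics, illustration only rather than the paddle implementation:

```python
import numpy as np

def length_regulate(xs, ds):
    # xs: (T_enc, D) phone-level embeddings; ds: (T_enc,) int64 durations.
    # Repeating row j of xs ds[j] times is equivalent to the M-matrix
    # construction in speedyspeech's expand(): M @ xs with M[k:k+d, j] = 1.
    return np.repeat(xs, ds, axis=0)  # (sum(ds), D)

xs = np.arange(6, dtype=np.float32).reshape(3, 2)  # 3 phones, D=2
ds = np.array([1, 0, 2], dtype=np.int64)           # a zero duration drops a phone
print(length_regulate(xs, ds))
# [[0. 1.]
#  [4. 5.]
#  [4. 5.]]
```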
From 887a101eee385dc80a18b4474680b85a1519637f Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 12 Jan 2022 08:46:43 +0000 Subject: [PATCH 28/60] update cli unit test, test=doc --- tests/unit/cli/test_cli.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) mode change 100644 => 100755 tests/unit/cli/test_cli.sh diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh old mode 100644 new mode 100755 index 5ce57fd0d..845c5d6a2 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -15,10 +15,12 @@ paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav # Text To Speech paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --voc mb_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --voc style_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" +paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0 paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "hello world" paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "hello, boys" --lang en --spk_id 0 - # Speech Translation (only support linux) paddlespeech st --input ./en.wav From d50d195145589939656991058a48e5e31da032df Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 12 Jan 2022 09:03:20 +0000 Subject: [PATCH 29/60] update frontend readme, test=doc --- examples/other/g2p/README.md | 4 ++-- examples/other/tn/README.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/other/g2p/README.md b/examples/other/g2p/README.md index d734cc0ca..c0f55bd42 100644 --- a/examples/other/g2p/README.md +++ b/examples/other/g2p/README.md @@ -10,11 +10,11 @@ Run the command below to get the results of the test. ```bash ./run.sh ``` -The `avg WER` of g2p is: 0.027495061517943988 +The `avg WER` of g2p is: 0.027124048652822204 ```text ,--------------------------------------------------------------------. | | # Snt # Wrd | Corr Sub Del Ins Err S.Err | |--------+-----------------+-----------------------------------------| - | Sum/Avg| 9996 299181 | 97.3 2.7 0.0 0.0 2.7 52.5 | + | Sum/Avg| 9996 299181 | 97.3 2.7 0.0 0.0 2.7 52.2 | `--------------------------------------------------------------------' ``` diff --git a/examples/other/tn/README.md b/examples/other/tn/README.md index 596b1815c..3b80de661 100644 --- a/examples/other/tn/README.md +++ b/examples/other/tn/README.md @@ -7,11 +7,11 @@ Run the command below to get the results of the test. ```bash ./run.sh ``` -The `avg CER` of text normalization is: 0.006388318503308237 +The `avg CER` of text normalization is: 0.00730093543235227 ```text ,-----------------------------------------------------------------. 
| | # Snt # Wrd | Corr Sub Del Ins Err S.Err | |--------+--------------+-----------------------------------------| - | Sum/Avg| 125 2254 | 99.4 0.1 0.5 0.1 0.7 3.2 | + | Sum/Avg| 125 2254 | 99.4 0.1 0.5 0.2 0.8 4.8 | `-----------------------------------------------------------------' ``` From ef90bd626a3ca04f376ff63a764592a8a7d68b74 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 12 Jan 2022 18:30:32 +0800 Subject: [PATCH 30/60] add tts papers, test=doc (#1330) --- docs/source/tts/tts_papers.md | 42 +++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 docs/source/tts/tts_papers.md diff --git a/docs/source/tts/tts_papers.md b/docs/source/tts/tts_papers.md new file mode 100644 index 000000000..2b35b8852 --- /dev/null +++ b/docs/source/tts/tts_papers.md @@ -0,0 +1,42 @@ +# TTS Papers +## Text Frontend +### Polyphone +- [【g2pM】g2pM: A Neural Grapheme-to-Phoneme Conversion Package for Mandarin Chinese Based on a New Open Benchmark Dataset](https://arxiv.org/abs/2004.03136) +- [Disambiguation of Chinese Polyphones in an End-to-End Framework with Semantic Features Extracted by Pre-trained BERT](https://www1.se.cuhk.edu.hk/~hccl/publications/pub/201909_INTERSPEECH_DongyangDAI.pdf) +### Text Normalization +#### English +- [applenob/text_normalization](https://github.com/applenob/text_normalization) +### G2P +#### English +- [cmusphinx/g2p-seq2seq](https://github.com/cmusphinx/g2p-seq2seq) + +## Acoustic Models +- [【AdaSpeech3】AdaSpeech 3: Adaptive Text to Speech for Spontaneous Style](https://arxiv.org/abs/2107.02530) +- [【AdaSpeech2】AdaSpeech 2: Adaptive Text to Speech with Untranscribed Data](https://arxiv.org/abs/2104.09715) +- [【AdaSpeech】AdaSpeech: Adaptive Text to Speech for Custom Voice](https://arxiv.org/abs/2103.00993) +- [【FastSpeech2】FastSpeech 2: Fast and High-Quality End-to-End Text to Speech](https://arxiv.org/abs/2006.04558) +- [【FastPitch】FastPitch: Parallel Text-to-speech with Pitch Prediction](https://arxiv.org/abs/2006.06873) +- [【SpeedySpeech】SpeedySpeech: Efficient Neural Speech Synthesis](https://arxiv.org/abs/2008.03802) +- [【FastSpeech】FastSpeech: Fast, Robust and Controllable Text to Speech](https://arxiv.org/abs/1905.09263) +- [【Transformer TTS】Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895) +- [【Tacotron2】Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions](https://arxiv.org/abs/1712.05884) + +## Vocoders +- [【RefineGAN】RefineGAN: Universally Generating Waveform Better than Ground Truth with Highly Accurate Pitch and Intensity Responses](https://arxiv.org/abs/2111.00962) +- [【Fre-GAN】Fre-GAN: Adversarial Frequency-consistent Audio Synthesis](https://arxiv.org/abs/2106.02297) +- [【StyleMelGAN】StyleMelGAN: An Efficient High-Fidelity Adversarial Vocoder with Temporal Adaptive Normalization](https://arxiv.org/abs/2011.01557) +- [【Multi-band MelGAN】Multi-band MelGAN: Faster Waveform Generation for High-Quality Text-to-Speech](https://arxiv.org/abs/2005.05106) +- [【HiFi-GAN】HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis](https://arxiv.org/abs/2010.05646) +- [【VocGAN】VocGAN: A High-Fidelity Real-time Vocoder with a Hierarchically-nested Adversarial Network](https://arxiv.org/abs/2007.15256) +- [【Parallel WaveGAN】Parallel WaveGAN: A fast waveform generation model based on generative adversarial networks with multi-resolution spectrogram](https://arxiv.org/abs/1910.11480) +- [【MelGAN】MelGAN: Generative Adversarial Networks for Conditional 
Waveform Synthesis](https://arxiv.org/abs/1910.06711) +- [【WaveFlow】WaveFlow: A Compact Flow-based Model for Raw Audio](https://arxiv.org/abs/1912.01219) +- [【LPCNet】LPCNet: Improving Neural Speech Synthesis Through Linear Prediction](https://arxiv.org/abs/1810.11846) +- [【WaveRNN】Efficient Neural Audio Synthesis](https://arxiv.org/abs/1802.08435) +## GAN TTS + +- [【GAN TTS】High Fidelity Speech Synthesis with Adversarial Networks](https://arxiv.org/abs/1909.11646) + +## Voice Cloning +- [【SV2TTS】Transfer Learning from Speaker Verification to Multispeaker Text-to-Speech Synthesis](https://arxiv.org/abs/1806.04558) +- [【GE2E】Generalized End-to-End Loss for Speaker Verification](https://arxiv.org/abs/1710.10467) From 08d82608722420ddb7502e424157c7aa063c9267 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Wed, 12 Jan 2022 19:06:03 +0800 Subject: [PATCH 31/60] add licecne for ctc (#1329) --- third_party/ctc_decoders/COPYING.APACHE2.0 | 201 +++++++++++++++++++++ third_party/ctc_decoders/COPYING.LESSER.3 | 165 +++++++++++++++++ third_party/ctc_decoders/LICENSE | 8 + third_party/ctc_decoders/scorer.cpp | 2 - third_party/ctc_decoders/scorer.h | 1 - third_party/ctc_decoders/setup.py | 4 +- 6 files changed, 376 insertions(+), 5 deletions(-) create mode 100644 third_party/ctc_decoders/COPYING.APACHE2.0 create mode 100644 third_party/ctc_decoders/COPYING.LESSER.3 create mode 100644 third_party/ctc_decoders/LICENSE diff --git a/third_party/ctc_decoders/COPYING.APACHE2.0 b/third_party/ctc_decoders/COPYING.APACHE2.0 new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/third_party/ctc_decoders/COPYING.APACHE2.0 @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/third_party/ctc_decoders/COPYING.LESSER.3 b/third_party/ctc_decoders/COPYING.LESSER.3 new file mode 100644 index 000000000..cca7fc278 --- /dev/null +++ b/third_party/ctc_decoders/COPYING.LESSER.3 @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. 
+ + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. + + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. + + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. 
+ + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. (If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. + + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. 
diff --git a/third_party/ctc_decoders/LICENSE b/third_party/ctc_decoders/LICENSE
new file mode 100644
index 000000000..eeef74b30
--- /dev/null
+++ b/third_party/ctc_decoders/LICENSE
@@ -0,0 +1,8 @@
+Most of the code here is licensed under the Apache License 2.0.
+There are exceptions that have their own licenses, listed below.
+
+scorer.h and scorer.cpp are under the LGPL license.
+These two files include header files from the KenLM project.
+
+For the rest:
+The default license of paddlespeech-ctcdecoders is Apache License 2.0.
diff --git a/third_party/ctc_decoders/scorer.cpp b/third_party/ctc_decoders/scorer.cpp
index 7c9a75d58..4186f115b 100644
--- a/third_party/ctc_decoders/scorer.cpp
+++ b/third_party/ctc_decoders/scorer.cpp
@@ -20,8 +20,6 @@
 #include "lm/config.hh"
 #include "lm/model.hh"
 #include "lm/state.hh"
-#include "util/string_piece.hh"
-#include "util/tokenize_piece.hh"
 
 #include "decoder_utils.h"
 
diff --git a/third_party/ctc_decoders/scorer.h b/third_party/ctc_decoders/scorer.h
index 3f3001e77..fdb5b46bb 100644
--- a/third_party/ctc_decoders/scorer.h
+++ b/third_party/ctc_decoders/scorer.h
@@ -23,7 +23,6 @@
 #include "lm/enumerate_vocab.hh"
 #include "lm/virtual_interface.hh"
 #include "lm/word_index.hh"
-#include "util/string_piece.hh"
 
 #include "path_trie.h"
 
diff --git a/third_party/ctc_decoders/setup.py b/third_party/ctc_decoders/setup.py
index 8a2086d6b..6484b87c5 100644
--- a/third_party/ctc_decoders/setup.py
+++ b/third_party/ctc_decoders/setup.py
@@ -127,11 +127,11 @@ decoders_module = [
 
 setup(
     name='paddlespeech_ctcdecoders',
-    version='0.1.0',
+    version='0.1.1',
     description="CTC decoders in paddlespeech",
     author="PaddlePaddle Speech and Language Team",
     author_email="paddlesl@baidu.com",
     url="https://github.com/PaddlePaddle/PaddleSpeech",
-    license='Apache 2.0',
+    license='Apache 2.0, GNU Lesser General Public License v3 (LGPLv3) (LGPL-3)',
     ext_modules=decoders_module,
     py_modules=['swig_decoders'])

From e0e12254751d0c89539f609af46b3f1f149c4432 Mon Sep 17 00:00:00 2001
From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com>
Date: Wed, 12 Jan 2022 19:47:00 +0800
Subject: [PATCH 32/60] Update README.md

---
 demos/speech_recognition/README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md
index 844c18b73..e060c2728 100644
--- a/demos/speech_recognition/README.md
+++ b/demos/speech_recognition/README.md
@@ -23,7 +23,10 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
 ### 3. Usage
 - Command Line(Recommended)
   ```bash
+  # Chinese
   paddlespeech asr --input ./zh.wav
+  # English
+  paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
   ```
   (It doesn't matter if package `paddlespeech-ctcdecoders` is not found, this package is optional.)
 
From 3ad18d0a9f1ee95f281e733a72f4b3816422eee4 Mon Sep 17 00:00:00 2001
From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com>
Date: Wed, 12 Jan 2022 19:48:48 +0800
Subject: [PATCH 33/60] Update README_cn.md

---
 demos/speech_recognition/README_cn.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md
index b15ec4523..2c72e6f4d 100644
--- a/demos/speech_recognition/README_cn.md
+++ b/demos/speech_recognition/README_cn.md
@@ -21,7 +21,10 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee
 ### 3.
使用方法 - 命令行 (推荐使用) ```bash + # 中文 paddlespeech asr --input ./zh.wav + # 英文 + paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav ``` (如果显示 `paddlespeech-ctcdecoders` 这个 python 包没有找到的 Error,没有关系,这个包是非必须的。) @@ -74,3 +77,4 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee | 模型 | 语言 | 采样率 | :--- | :---: | :---: | | conformer_wenetspeech| zh| 16000 +| transformer_librispeech| en| 16000 From 5bfceddeb581e3912582dea540d84b820fc72c55 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Wed, 12 Jan 2022 19:49:45 +0800 Subject: [PATCH 34/60] Update README.md --- demos/speech_recognition/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index e060c2728..a3cda1ef9 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -80,3 +80,4 @@ Here is a list of pretrained models released by PaddleSpeech that can be used by | Model | Language | Sample Rate | :--- | :---: | :---: | | conformer_wenetspeech| zh| 16000 +| transformer_librispeech| en| 16000 From f4983ed2bd730285e730c0997906fed46adcc3cb Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Wed, 12 Jan 2022 19:53:20 +0800 Subject: [PATCH 35/60] Update README.md --- demos/speech_recognition/README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index a3cda1ef9..c49afa35c 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -46,7 +46,10 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee Output: ```bash + # Chinese [2021-12-08 13:12:34,063] [ INFO] [utils.py] [L225] - ASR Result: 我认为跑步最重要的就是给我带来了身体健康 + # English + [2022-01-12 11:51:10,815] [ INFO] - ASR Result: i knocked at the door on the ancient side of the building ``` - Python API From 6147cfc6786091b981c135a1cfea59e51ace27f1 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Wed, 12 Jan 2022 19:54:09 +0800 Subject: [PATCH 36/60] Update README_cn.md --- demos/speech_recognition/README_cn.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index 2c72e6f4d..cc19487da 100644 --- a/demos/speech_recognition/README_cn.md +++ b/demos/speech_recognition/README_cn.md @@ -44,7 +44,10 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee 输出: ```bash + # 中文 [2021-12-08 13:12:34,063] [ INFO] [utils.py] [L225] - ASR Result: 我认为跑步最重要的就是给我带来了身体健康 + # 英文 + [2022-01-12 11:51:10,815] [ INFO] - ASR Result: i knocked at the door on the ancient side of the building ``` - Python API From 795e4d02d4100651b5382d541d9d32355ffb88f7 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 12 Jan 2022 19:55:22 +0800 Subject: [PATCH 37/60] [install] remove nltk deps --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 986eecf05..a750f0b8f 100644 --- a/setup.py +++ b/setup.py @@ -41,7 +41,6 @@ requirements = { "loguru", "matplotlib", "nara_wpe", - "nltk", "pandas", "paddleaudio", "paddlenlp", From 6ba1f16d135a5f500ddebd81abe166b4285e7108 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Wed, 12 Jan 2022 19:56:32 +0800 Subject: [PATCH 38/60] Update README_cn.md --- 
demos/speech_recognition/README_cn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index cc19487da..c2e38c91b 100644 --- a/demos/speech_recognition/README_cn.md +++ b/demos/speech_recognition/README_cn.md @@ -2,7 +2,7 @@ # 语音识别 ## 介绍 -语音识别解决让计算机程序自动转录语音的问题。 +语音识别是一项用计算机程序自动转录语音的技术。 这个 demo 是一个从给定音频文件识别文本的实现,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。 ## 使用方法 From ec1c88ae1a4dc029802b1a9518907902c0e261a0 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Wed, 12 Jan 2022 19:56:58 +0800 Subject: [PATCH 39/60] [s2t] remove nltk (#1332) --- paddlespeech/s2t/utils/bleu_score.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/paddlespeech/s2t/utils/bleu_score.py b/paddlespeech/s2t/utils/bleu_score.py index ea32fcf95..a50c000ae 100644 --- a/paddlespeech/s2t/utils/bleu_score.py +++ b/paddlespeech/s2t/utils/bleu_score.py @@ -14,7 +14,6 @@ """This module provides functions to calculate bleu score in different level. e.g. wer for word-level, cer for char-level. """ -import nltk import numpy as np import sacrebleu @@ -114,6 +113,5 @@ class ErrorCalculator(): seq_true_text = "".join(seq_true).replace(self.space, " ") seqs_hat.append(seq_hat_text) seqs_true.append(seq_true_text) - bleu = nltk.bleu_score.corpus_bleu([[ref] for ref in seqs_true], - seqs_hat) - return bleu * 100 + bleu = sacrebleu.corpus_bleu(seqs_hat, [[ref] for ref in seqs_true]) + return bleu.score * 100 From 1ececba79b692c9928f0ec2a684caf5eb939b62a Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Wed, 12 Jan 2022 12:13:12 +0000 Subject: [PATCH 40/60] fix requirement, test=doc --- requirements.txt | 1 + setup.py | 1 + 2 files changed, 2 insertions(+) diff --git a/requirements.txt b/requirements.txt index c68893183..760821662 100644 --- a/requirements.txt +++ b/requirements.txt @@ -45,3 +45,4 @@ visualdl webrtcvad yacs~=0.1.8 yq +zhon diff --git a/setup.py b/setup.py index 986eecf05..d0410567b 100644 --- a/setup.py +++ b/setup.py @@ -78,6 +78,7 @@ requirements = { "unidecode", "yq", "pre-commit", + "zhon", ] } From db35a232b5440f1849b4e87eb1100485ae2db510 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Thu, 13 Jan 2022 10:24:48 +0800 Subject: [PATCH 41/60] Update released_model.md --- docs/source/released_model.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/released_model.md b/docs/source/released_model.md index 3a3bc9246..3310bfb23 100644 --- a/docs/source/released_model.md +++ b/docs/source/released_model.md @@ -24,7 +24,7 @@ Language Model | Training Data | Token-based | Size | Descriptions | Model | Training Data | Token-based | Size | Descriptions | BLEU | Example Link | | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | :-----: | -| [Transformer FAT-ST MTL En-Zh](https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz) | Ted-En-Zh| Spm| | Encoder:Transformer, Decoder:Transformer,
Decoding method: Attention | 20.80 | [Transformer Ted-En-Zh ST1](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/ted_en_zh/st1) |
+| (only for CLI) [Transformer FAT-ST MTL En-Zh](https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz) | Ted-En-Zh| Spm| | Encoder:Transformer, Decoder:Transformer,
Decoding method: Attention | 20.80 | [Transformer Ted-En-Zh ST1](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/ted_en_zh/st1) | ## Text-to-Speech Models From a1867c20c341e7305c5442d3617a8ab1f900346e Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 13 Jan 2022 14:21:29 +0800 Subject: [PATCH 42/60] fix slice bug of speedyspeech expand, test=tts (#1337) --- paddlespeech/t2s/models/speedyspeech/speedyspeech.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index 3e64e670c..cc9e20662 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -30,7 +30,9 @@ def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor: k = 0 for j in range(t_enc): d = durations[i, j] - M[i, k:k + d, j] = 1 + # If the d == 0, slice action is meaningless and not supported + if d >= 1: + M[0, k:k + d, j] = 1 k += d encodings = paddle.matmul(M, encodings) return encodings From dae6bea546dc2123eb7c3f76661bb152efb96167 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Thu, 13 Jan 2022 14:22:01 +0800 Subject: [PATCH 43/60] [ctcdecoders]fix the licence (#1336) * fix the licence, test=doc * Update ctc_beam_search_decoder.cpp * Update ctc_beam_search_decoder.h * Update ctc_greedy_decoder.cpp * Update ctc_greedy_decoder.h * Update decoder_utils.cpp * Update decoder_utils.h * Update path_trie.cpp * Update path_trie.h * Update scorer.h * Update scorer.cpp --- .../ctc_decoders/ctc_beam_search_decoder.cpp | 2 +- third_party/ctc_decoders/ctc_beam_search_decoder.h | 2 +- third_party/ctc_decoders/ctc_greedy_decoder.cpp | 2 +- third_party/ctc_decoders/ctc_greedy_decoder.h | 2 +- third_party/ctc_decoders/decoder_utils.cpp | 2 +- third_party/ctc_decoders/decoder_utils.h | 2 +- third_party/ctc_decoders/path_trie.cpp | 2 +- third_party/ctc_decoders/path_trie.h | 2 +- third_party/ctc_decoders/scorer.cpp | 14 +------------- third_party/ctc_decoders/scorer.h | 14 +------------- 10 files changed, 10 insertions(+), 34 deletions(-) diff --git a/third_party/ctc_decoders/ctc_beam_search_decoder.cpp b/third_party/ctc_decoders/ctc_beam_search_decoder.cpp index 663c52bb4..db742fbbe 100644 --- a/third_party/ctc_decoders/ctc_beam_search_decoder.cpp +++ b/third_party/ctc_decoders/ctc_beam_search_decoder.cpp @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/third_party/ctc_decoders/ctc_beam_search_decoder.h b/third_party/ctc_decoders/ctc_beam_search_decoder.h index eaba9da8c..584226574 100644 --- a/third_party/ctc_decoders/ctc_beam_search_decoder.h +++ b/third_party/ctc_decoders/ctc_beam_search_decoder.h @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // diff --git a/third_party/ctc_decoders/ctc_greedy_decoder.cpp b/third_party/ctc_decoders/ctc_greedy_decoder.cpp index 53a04fba0..a178c6734 100644 --- a/third_party/ctc_decoders/ctc_greedy_decoder.cpp +++ b/third_party/ctc_decoders/ctc_greedy_decoder.cpp @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/third_party/ctc_decoders/ctc_greedy_decoder.h b/third_party/ctc_decoders/ctc_greedy_decoder.h index dd1b33315..4d60beaf1 100644 --- a/third_party/ctc_decoders/ctc_greedy_decoder.h +++ b/third_party/ctc_decoders/ctc_greedy_decoder.h @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/third_party/ctc_decoders/decoder_utils.cpp b/third_party/ctc_decoders/decoder_utils.cpp index e86c22401..c7ef65428 100644 --- a/third_party/ctc_decoders/decoder_utils.cpp +++ b/third_party/ctc_decoders/decoder_utils.cpp @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/third_party/ctc_decoders/decoder_utils.h b/third_party/ctc_decoders/decoder_utils.h index 1d75d03db..098741552 100644 --- a/third_party/ctc_decoders/decoder_utils.h +++ b/third_party/ctc_decoders/decoder_utils.h @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/third_party/ctc_decoders/path_trie.cpp b/third_party/ctc_decoders/path_trie.cpp index f52d11573..a5e7dd3da 100644 --- a/third_party/ctc_decoders/path_trie.cpp +++ b/third_party/ctc_decoders/path_trie.cpp @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // diff --git a/third_party/ctc_decoders/path_trie.h b/third_party/ctc_decoders/path_trie.h index 717d4b004..5193e0a47 100644 --- a/third_party/ctc_decoders/path_trie.h +++ b/third_party/ctc_decoders/path_trie.h @@ -1,6 +1,6 @@ // Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // -// Licensed under the Apache License, Version 2.0 (the "License"); +// Licensed under the Apache License, Version 2.0 (the "COPYING.APACHE2.0"); // you may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // diff --git a/third_party/ctc_decoders/scorer.cpp b/third_party/ctc_decoders/scorer.cpp index 4186f115b..977112d17 100644 --- a/third_party/ctc_decoders/scorer.cpp +++ b/third_party/ctc_decoders/scorer.cpp @@ -1,16 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Licensed under GNU Lesser General Public License v3 (LGPLv3) (LGPL-3) (the "COPYING.LESSER.3"); #include "scorer.h" diff --git a/third_party/ctc_decoders/scorer.h b/third_party/ctc_decoders/scorer.h index fdb5b46bb..5739339df 100644 --- a/third_party/ctc_decoders/scorer.h +++ b/third_party/ctc_decoders/scorer.h @@ -1,16 +1,4 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. +// Licensed under GNU Lesser General Public License v3 (LGPLv3) (LGPL-3) (the "COPYING.LESSER.3"); #ifndef SCORER_H_ #define SCORER_H_ From 39f5679ed6174d70017692e79776839a10032327 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Thu, 13 Jan 2022 16:56:54 +0800 Subject: [PATCH 44/60] test=doc --- .../package_release/python_package_release.md | 103 +++++++++++------- 1 file changed, 64 insertions(+), 39 deletions(-) diff --git a/docs/topic/package_release/python_package_release.md b/docs/topic/package_release/python_package_release.md index 382509623..2d2edbb4a 100644 --- a/docs/topic/package_release/python_package_release.md +++ b/docs/topic/package_release/python_package_release.md @@ -1,4 +1,13 @@ -# 发包方法 +# 简化安装与发包 + +## 问题: + +1. [如何去除ubuntu的apt安装依赖?](#conda-代替系统依赖) +2. [如何支持普通用户和开发者两种安装的需求,尽量减少普通用户所需的依赖?](#区分install模式和develop模式) +3. [如何进行python包的动态安装?](#python-包的动态安装) +4. [如何进行python项目编包?](#python-编包方法) +5. [发包前要有什么准备?](#关于发包前的准备工作) +6. [发C++包需要注意的东西?](#manylinux) @@ -34,6 +43,44 @@ conda install -c conda-forge eigen boost cmake +## 区分install模式和develop模式 + +可以在setup.py 中划分 install 的依赖(基本依赖)和 develop 的依赖 (开发者额外依赖)。 setup_info 中 `install_requires` 设置 install 的依赖,而在 `extras_require` 中设置 `develop` key为 develop的依赖。 +普通安装可以使用: + +```bash +pip install . 
+``` + +另外使用 pip 安装已发的包也是使用普通安装的: + +``` +pip install paddlespeech +``` + +而开发者可以使用如下方式安装,这样不仅会安装install的依赖,也会安装develop的依赖, 即:最后安装的依赖=install依赖 + develop依赖: + +```bash +pip install -e .[develop] +``` + + + +## python 包的动态安装 + +可以使用 pip包来实现动态安装: + +```python +import pip +if int(pip.__version__.split('.')[0]) > 9: + from pip._internal import main + else: + from pip import main + main(['install', package_name]) +``` + + + ## python 编包方法 #### 创建 pypi的账号 @@ -74,7 +121,22 @@ twine upload dist/wheel包 -## Manylinux 降低含有 C++ 依赖的 pip 包的 glibc 依赖 + + +## 关于发包前的准备工作 + +#### 拉分支 +在发包之前需要拉分支。例如需要发0.1.0版本的正式包,则需要拉一个r0.1的分支。并且在这个r0.1分支的包上面打0.1.0的tag。在拉分支之前可以选择性的使用rc版本发一个正式版前的试用包,例如0.1.0rc0,等到rc包测试通过后,再拉分支(如果是发0.1.1包,则merge r0.1分支),打tag,完成发包。总体步骤可以总结为: + +- 用develop分支发rc包 +- rc包通过后拉分支 +- 打tag +- 发包 +- 编写release note + + + +## ManyLinux 为了让有C++依赖的 pip wheel 包可以适用于更多的 linux 系统,需要降低其本身的 glibc 的依赖。这就需要让 pip wheel 包在 manylinux 的 docker 下编包。关于查看系统的 glibc 版本,可以使用命令:`ldd --version`。 @@ -120,40 +182,3 @@ auditwheel show wheel包 auditwheel repair wheel包 ``` - - -## 区分 install 模式和 develop 模式 - -可以在setup.py 中划分 install 的依赖(基本依赖)和 develop 的依赖 (开发者额外依赖)。 setup_info 中 `install_requires` 设置 install 的依赖,而在 `extras_require` 中设置 `develop` key为 develop的依赖。 -普通安装可以使用: - -```bash -pip install . -``` - -另外使用 pip 安装已发的包也是使用普通安装的: - -``` -pip install paddlespeech -``` - -而开发者可以使用如下方式安装,这样不仅会安装install的依赖,也会安装develop的依赖, 即:最后安装的依赖=install依赖 + develop依赖: - -```bash -pip install -e .[develop] -``` - - - -## python 包的动态安装 - -可以使用 pip包来实现动态安装: - -```python -import pip -if int(pip.__version__.split('.')[0]) > 9: - from pip._internal import main - else: - from pip import main - main(['install', package_name]) -``` From d59a31b6133c82ac94a238d9a6f5c1e00416bc83 Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Thu, 13 Jan 2022 17:02:43 +0800 Subject: [PATCH 45/60] test=doc_fix --- .../package_release/python_package_release.md | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/docs/topic/package_release/python_package_release.md b/docs/topic/package_release/python_package_release.md index 2d2edbb4a..e3086efe1 100644 --- a/docs/topic/package_release/python_package_release.md +++ b/docs/topic/package_release/python_package_release.md @@ -2,12 +2,12 @@ ## 问题: -1. [如何去除ubuntu的apt安装依赖?](#conda-代替系统依赖) +1. [如何去除 ubuntu 的 apt 安装依赖?](#conda-代替系统依赖) 2. [如何支持普通用户和开发者两种安装的需求,尽量减少普通用户所需的依赖?](#区分install模式和develop模式) -3. [如何进行python包的动态安装?](#python-包的动态安装) -4. [如何进行python项目编包?](#python-编包方法) +3. [如何进行 python 包的动态安装?](#python-包的动态安装) +4. [如何进行 python 项目编包?](#python-编包方法) 5. [发包前要有什么准备?](#关于发包前的准备工作) -6. [发C++包需要注意的东西?](#manylinux) +6. 
[发 C++ 包需要注意的东西?](#manylinux) @@ -126,23 +126,23 @@ twine upload dist/wheel包 ## 关于发包前的准备工作 #### 拉分支 -在发包之前需要拉分支。例如需要发0.1.0版本的正式包,则需要拉一个r0.1的分支。并且在这个r0.1分支的包上面打0.1.0的tag。在拉分支之前可以选择性的使用rc版本发一个正式版前的试用包,例如0.1.0rc0,等到rc包测试通过后,再拉分支(如果是发0.1.1包,则merge r0.1分支),打tag,完成发包。总体步骤可以总结为: +在发包之前需要拉分支。例如需要发0.1.0版本的正式包,则需要拉一个 r0.1 的分支。并且在这个 r0.1 分支的包上面打 0.1.0 的tag。在拉分支之前可以选择性的使用 rc 版本发一个正式版前的试用包,例如0.1.0rc0,等到rc包测试通过后,再拉分支(如果是发 0.1.1 包,则 merge r0.1分支),打tag,完成发包。总体步骤可以总结为: -- 用develop分支发rc包 -- rc包通过后拉分支 -- 打tag +- 用 develop 分支发 rc 包 +- rc 包通过后拉分支 +- 打 tag - 发包 -- 编写release note +- 编写 release note ## ManyLinux -为了让有C++依赖的 pip wheel 包可以适用于更多的 linux 系统,需要降低其本身的 glibc 的依赖。这就需要让 pip wheel 包在 manylinux 的 docker 下编包。关于查看系统的 glibc 版本,可以使用命令:`ldd --version`。 +为了让有 C++ 依赖的 pip wheel 包可以适用于更多的 linux 系统,需要降低其本身的 glibc 的依赖。这就需要让 pip wheel 包在 manylinux 的 docker 下编包。关于查看系统的 glibc 版本,可以使用命令:`ldd --version`。 ### Manylinux -关于Many Linux,主要可以参考 Github 项目的说明[ github many linux](https://github.com/pypa/manylinux)。 +关于 Manylinux,主要可以参考 Github 项目的说明[ github many linux](https://github.com/pypa/manylinux)。 manylinux1 支持 Centos5以上, manylinux2010 支持 Centos 6 以上,manylinux2014 支持Centos 7 以上。 目前使用 manylinux2010 基本可以满足所有的 linux 生产环境需求。(不建议使用manylinux1,系统较老,难度较大) @@ -160,7 +160,7 @@ docker pull quay.io/pypa/manylinux1_x86_64 docker run -it xxxxxx ``` -在 Many Linux 2010 的docker环境自带 swig 和各种类型的 python 版本。这里注意不要自己下载conda 来安装环境来编译 pip 包,要用 docker 本身的环境来编包。 +在 manylinux2010 的docker环境自带 swig 和各种类型的 python 版本。这里注意不要自己下载 conda 来安装环境来编译 pip 包,要用 docker 本身的环境来编包。 设置python: ```bash From 8283a12d0feb636eb9a487a7f2a21205e500d3b6 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Thu, 13 Jan 2022 17:08:27 +0800 Subject: [PATCH 46/60] uodate readme, test=doc --- .../package_release/python_package_release.md | 31 +++++++------------ 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/docs/topic/package_release/python_package_release.md b/docs/topic/package_release/python_package_release.md index e3086efe1..b34b4e21b 100644 --- a/docs/topic/package_release/python_package_release.md +++ b/docs/topic/package_release/python_package_release.md @@ -13,27 +13,27 @@ ## conda 代替系统依赖 -conda可以用来代替一些 apt-get 安装的系统依赖,这样可以让项目适用于除了 ubuntu 以外的系统。 +conda 可以用来代替一些 apt-get 安装的系统依赖,这样可以让项目适用于除了 ubuntu 以外的系统。 -使用 conda 可以安装 sox, libsndfile,swig等 paddlespeech 需要的依赖: +使用 conda 可以安装 sox, libsndfile,swig 等 paddlespeech 需要的依赖: ```bash conda install -y -c conda-forge sox libsndfile ``` -部分系统会缺少libbzip2库,这个 paddlespeech 也是需要的,这也可以用 conda 安装: +部分系统会缺少 libbzip2 库,这个 paddlespeech 也是需要的,这也可以用 conda 安装: ```bash conda install -y -c bzip2 ``` -conda也可以安装linux的C++的依赖: +conda 也可以安装 linux 的 C++ 的依赖: ```bash conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 ``` -#### 剩余问题:使用conda环境编译kenlm失败。目前在conda环境下编译kenlm会出现链接失败的问题 +#### 剩余问题:使用 conda 环境编译 kenlm 失败。目前在 conda 环境下编译 kenlm 会出现链接失败的问题 目前知道需要的依赖: @@ -41,11 +41,9 @@ conda install -y -c gcc_linux-64=8.4.0 gxx_linux-64=8.4.0 conda install -c conda-forge eigen boost cmake ``` - - ## 区分install模式和develop模式 -可以在setup.py 中划分 install 的依赖(基本依赖)和 develop 的依赖 (开发者额外依赖)。 setup_info 中 `install_requires` 设置 install 的依赖,而在 `extras_require` 中设置 `develop` key为 develop的依赖。 +可以在 setup.py 中划分 install 的依赖(基本依赖)和 develop 的依赖 (开发者额外依赖)。 setup_info 中 `install_requires` 设置 install 的依赖,而在 `extras_require` 中设置 `develop` key 为 develop 的依赖。 普通安装可以使用: ```bash @@ -58,17 +56,15 @@ pip install . 
pip install paddlespeech ``` -而开发者可以使用如下方式安装,这样不仅会安装install的依赖,也会安装develop的依赖, 即:最后安装的依赖=install依赖 + develop依赖: +而开发者可以使用如下方式安装,这样不仅会安装 install 的依赖,也会安装 develop 的依赖, 即:最后安装的依赖 = install 依赖 + develop 依赖: ```bash pip install -e .[develop] ``` - - ## python 包的动态安装 -可以使用 pip包来实现动态安装: +可以使用 pip 包来实现动态安装: ```python import pip @@ -79,8 +75,6 @@ if int(pip.__version__.split('.')[0]) > 9: main(['install', package_name]) ``` - - ## python 编包方法 #### 创建 pypi的账号 @@ -95,7 +89,7 @@ pip install twine #### python 编包 -编写好python包的setup.py, 然后使用如下命令编wheel包: +编写好 python 包的 setup.py, 然后使用如下命令编 wheel 包: ```bash python setup.py bdist_wheel @@ -113,20 +107,17 @@ python setup.py sdist twine upload dist/wheel包 ``` -输入账号和密码后就可以上传wheel包了 +输入账号和密码后就可以上传 wheel 包了 #### 关于python 包的发包信息 主要可以参考这个[文档](https://packaging.python.org/en/latest/guides/distributing-packages-using-setuptools/?highlight=find_packages) - - - ## 关于发包前的准备工作 #### 拉分支 -在发包之前需要拉分支。例如需要发0.1.0版本的正式包,则需要拉一个 r0.1 的分支。并且在这个 r0.1 分支的包上面打 0.1.0 的tag。在拉分支之前可以选择性的使用 rc 版本发一个正式版前的试用包,例如0.1.0rc0,等到rc包测试通过后,再拉分支(如果是发 0.1.1 包,则 merge r0.1分支),打tag,完成发包。总体步骤可以总结为: +在发包之前需要拉分支。例如需要发 0.1.0 版本的正式包,则需要拉一个 r0.1 的分支。并且在这个 r0.1 分支的包上面打 0.1.0 的tag。在拉分支之前可以选择性的使用 rc 版本发一个正式版前的试用包,例如0.1.0rc0,等到rc包测试通过后,再拉分支(如果是发 0.1.1 包,则 merge r0.1分支),打tag,完成发包。总体步骤可以总结为: - 用 develop 分支发 rc 包 - rc 包通过后拉分支 From 7f5162156ba3285de4bfaaa30037bde37d294d8a Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Thu, 13 Jan 2022 17:10:42 +0800 Subject: [PATCH 47/60] test=doc --- docs/topic/package_release/python_package_release.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/topic/package_release/python_package_release.md b/docs/topic/package_release/python_package_release.md index b34b4e21b..96d6f5f4e 100644 --- a/docs/topic/package_release/python_package_release.md +++ b/docs/topic/package_release/python_package_release.md @@ -10,12 +10,11 @@ 6. [发 C++ 包需要注意的东西?](#manylinux) - ## conda 代替系统依赖 conda 可以用来代替一些 apt-get 安装的系统依赖,这样可以让项目适用于除了 ubuntu 以外的系统。 -使用 conda 可以安装 sox, libsndfile,swig 等 paddlespeech 需要的依赖: +使用 conda 可以安装 sox、 libsndfile、swig 等 paddlespeech 需要的依赖: ```bash conda install -y -c conda-forge sox libsndfile @@ -172,4 +171,3 @@ auditwheel show wheel包 ```bash auditwheel repair wheel包 ``` - From 9a96ebac6690ae74d00caf80ce69b5642caa3f8b Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Thu, 13 Jan 2022 11:21:37 +0000 Subject: [PATCH 48/60] update version, test=doc --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 2f51e855d..a6b18f979 100644 --- a/setup.py +++ b/setup.py @@ -172,7 +172,7 @@ class UploadCommand(Command): setup_info = dict( # Metadata name='paddlespeech', - version='0.1.0', + version='0.1.1', author='PaddlePaddle Speech and Language Team', author_email='paddlesl@baidu.com', url='https://github.com/PaddlePaddle/PaddleSpeech', From 5e7e8a3e240a6463f33452969f2bd93bc43e4a90 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Fri, 14 Jan 2022 06:55:46 +0000 Subject: [PATCH 49/60] fix the u2 export, test=asr --- paddlespeech/s2t/exps/u2/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py index 992be5cd4..85bb877b1 100644 --- a/paddlespeech/s2t/exps/u2/model.py +++ b/paddlespeech/s2t/exps/u2/model.py @@ -524,10 +524,10 @@ class U2Tester(U2Trainer): List[paddle.static.InputSpec]: input spec. 
""" from paddlespeech.s2t.models.u2 import U2InferModel - infer_model = U2InferModel.from_pretrained(self.train_loader, + infer_model = U2InferModel.from_pretrained(self.test_loader, self.config.clone(), self.args.checkpoint_path) - feat_dim = self.train_loader.feat_dim + feat_dim = self.test_loader.feat_dim input_spec = [ paddle.static.InputSpec(shape=[1, None, feat_dim], dtype='float32'), # audio, [B,T,D] From 4cc763baeeebc0e2571fcf0a3e85f8c009f8a1fd Mon Sep 17 00:00:00 2001 From: Jackwaterveg <87408988+Jackwaterveg@users.noreply.github.com> Date: Fri, 14 Jan 2022 15:49:38 +0800 Subject: [PATCH 50/60] Update python_package_release.md --- .../topic/package_release/python_package_release.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/docs/topic/package_release/python_package_release.md b/docs/topic/package_release/python_package_release.md index 96d6f5f4e..3e3f9dbf6 100644 --- a/docs/topic/package_release/python_package_release.md +++ b/docs/topic/package_release/python_package_release.md @@ -116,8 +116,19 @@ twine upload dist/wheel包 ## 关于发包前的准备工作 #### 拉分支 -在发包之前需要拉分支。例如需要发 0.1.0 版本的正式包,则需要拉一个 r0.1 的分支。并且在这个 r0.1 分支的包上面打 0.1.0 的tag。在拉分支之前可以选择性的使用 rc 版本发一个正式版前的试用包,例如0.1.0rc0,等到rc包测试通过后,再拉分支(如果是发 0.1.1 包,则 merge r0.1分支),打tag,完成发包。总体步骤可以总结为: +在发包之前需要拉分支。例如需要发 0.1.0 版本的正式包,则需要拉一个 r0.1 的分支。并且在这个 r0.1 分支的包上面打 0.1.0 的tag。在拉分支之前可以选择性的使用 rc 版本发一个正式版前的试用包,例如0.1.0rc0,等到rc包测试通过后,再拉分支(如果是发 0.1.1 包,则 merge r0.1分支),打tag,完成发包。 +关于打tag的命令,可以参考[git 基础](https://git-scm.com/book/zh/v2/Git-%E5%9F%BA%E7%A1%80-%E6%89%93%E6%A0%87%E7%AD%BE)。使用轻量标签即可: +```bash +git tag r0.1.1 commit_id +``` +然后使用`git push` 把本地 tag 传到远程 repo 上即可 +```bash +git push origin r0.1.1 +``` +打完 tag 后要记得编写 release note。 + +最后,发包准备工作的步骤可以总结为: - 用 develop 分支发 rc 包 - rc 包通过后拉分支 - 打 tag From 43aad7a01873489daeb5f620f50b1ef3012702d9 Mon Sep 17 00:00:00 2001 From: Junkun Date: Fri, 14 Jan 2022 13:59:53 -0800 Subject: [PATCH 51/60] beam search with optimality guarantees --- paddlespeech/s2t/exps/u2_st/model.py | 4 +- paddlespeech/s2t/models/u2_st/u2_st.py | 145 ++++++++++++------------- 2 files changed, 75 insertions(+), 74 deletions(-) diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index b03ca38b6..b642e9337 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -285,7 +285,7 @@ class U2STTrainer(Trainer): subsampling_factor=1, load_aux_output=load_transcript, num_encs=1, - dist_sampler=True) + dist_sampler=False) logger.info("Setup train/valid Dataloader!") else: # test dataset, return raw text @@ -408,6 +408,7 @@ class U2STTester(U2STTrainer): decoding_method=decode_cfg.decoding_method, beam_size=decode_cfg.beam_size, word_reward=decode_cfg.word_reward, + maxlen_ratio=decode_cfg.maxlen_ratio, decoding_chunk_size=decode_cfg.decoding_chunk_size, num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, simulate_streaming=decode_cfg.simulate_streaming) @@ -435,6 +436,7 @@ class U2STTester(U2STTrainer): decoding_method=decode_cfg.decoding_method, beam_size=decode_cfg.beam_size, word_reward=decode_cfg.word_reward, + maxlen_ratio=decode_cfg.maxlen_ratio, decoding_chunk_size=decode_cfg.decoding_chunk_size, num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, simulate_streaming=decode_cfg.simulate_streaming) diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index 79ca423f8..211813f63 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ 
-264,14 +264,17 @@ class U2STBaseModel(nn.Layer): speech_lengths: paddle.Tensor, beam_size: int=10, word_reward: float=0.0, + maxlen_ratio: float=0.5, decoding_chunk_size: int=-1, num_decoding_left_chunks: int=-1, simulate_streaming: bool=False, ) -> paddle.Tensor: - """ Apply beam search on attention decoder + """ Apply beam search on attention decoder with length penalty Args: speech (paddle.Tensor): (batch, max_len, feat_dim) speech_length (paddle.Tensor): (batch, ) beam_size (int): beam size for beam search + word_reward (float): word reward used in beam search + maxlen_ratio (float): max length ratio to bound the length of translated text decoding_chunk_size (int): decoding chunk for dynamic chunk trained model. <0: for decoding, use full chunk. @@ -284,90 +287,84 @@ class U2STBaseModel(nn.Layer): """ assert speech.shape[0] == speech_lengths.shape[0] assert decoding_chunk_size != 0 + assert speech.shape[0] == 1 device = speech.place - batch_size = speech.shape[0] # Let's assume B = batch_size and N = beam_size - # 1. Encoder + # 1. Encoder and init hypothesis encoder_out, encoder_mask = self._forward_encoder( speech, speech_lengths, decoding_chunk_size, num_decoding_left_chunks, simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = encoder_out.shape[1] - encoder_dim = encoder_out.shape[2] - running_size = batch_size * beam_size - encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view( - running_size, maxlen, encoder_dim) # (B*N, maxlen, encoder_dim) - encoder_mask = encoder_mask.unsqueeze(1).repeat( - 1, beam_size, 1, 1).view(running_size, 1, - maxlen) # (B*N, 1, max_len) - - hyps = paddle.ones( - [running_size, 1], dtype=paddle.long).fill_(self.sos) # (B*N, 1) - # log scale score - scores = paddle.to_tensor( - [0.0] + [-float('inf')] * (beam_size - 1), dtype=paddle.float) - scores = scores.to(device).repeat(batch_size).unsqueeze(1).to( - device) # (B*N, 1) - end_flag = paddle.zeros_like(scores, dtype=paddle.bool) # (B*N, 1) - cache: Optional[List[paddle.Tensor]] = None + + maxlen = max(int(encoder_out.shape[1] * maxlen_ratio), 5) + + hyp = {"score": 0.0, "yseq": [self.sos], "cache": None} + hyps = [hyp] + ended_hyps = [] + cur_best_score = -float("inf") + cache = None + # 2. 
Decoder forward step by step for i in range(1, maxlen + 1): - # Stop if all batch and all beam produce eos - # TODO(Hui Zhang): if end_flag.sum() == running_size: - if end_flag.cast(paddle.int64).sum() == running_size: - break - - # 2.1 Forward decoder step - hyps_mask = subsequent_mask(i).unsqueeze(0).repeat( - running_size, 1, 1).to(device) # (B*N, i, i) - # logp: (B*N, vocab) + ys = paddle.ones((len(hyps), i), dtype=paddle.long) + + if hyps[0]["cache"] is not None: + cache = [paddle.ones((len(hyps), i-1, hyps[0]["cache"][0].shape[-1]), dtype=paddle.float32) for _ in range(len(hyps[0]["cache"]))] + for j, hyp in enumerate(hyps): + ys[j, :] = paddle.to_tensor(hyp["yseq"]) + if hyps[0]["cache"] is not None: + for k in range(len(cache)): + cache[k][j] = hyps[j]["cache"][k] + ys_mask = subsequent_mask(i).unsqueeze(0).to(device) + logp, cache = self.st_decoder.forward_one_step( - encoder_out, encoder_mask, hyps, hyps_mask, cache) - - # 2.2 First beam prune: select topk best prob at current time - top_k_logp, top_k_index = logp.topk(beam_size) # (B*N, N) - top_k_logp += word_reward - top_k_logp = mask_finished_scores(top_k_logp, end_flag) - top_k_index = mask_finished_preds(top_k_index, end_flag, self.eos) - - # 2.3 Seconde beam prune: select topk score with history - scores = scores + top_k_logp # (B*N, N), broadcast add - scores = scores.view(batch_size, beam_size * beam_size) # (B, N*N) - scores, offset_k_index = scores.topk(k=beam_size) # (B, N) - scores = scores.view(-1, 1) # (B*N, 1) - - # 2.4. Compute base index in top_k_index, - # regard top_k_index as (B*N*N),regard offset_k_index as (B*N), - # then find offset_k_index in top_k_index - base_k_index = paddle.arange(batch_size).view(-1, 1).repeat( - 1, beam_size) # (B, N) - base_k_index = base_k_index * beam_size * beam_size - best_k_index = base_k_index.view(-1) + offset_k_index.view( - -1) # (B*N) - - # 2.5 Update best hyps - best_k_pred = paddle.index_select( - top_k_index.view(-1), index=best_k_index, axis=0) # (B*N) - best_hyps_index = best_k_index // beam_size - last_best_k_hyps = paddle.index_select( - hyps, index=best_hyps_index, axis=0) # (B*N, i) - hyps = paddle.cat( - (last_best_k_hyps, best_k_pred.view(-1, 1)), - dim=1) # (B*N, i+1) - - # 2.6 Update end flag - end_flag = paddle.eq(hyps[:, -1], self.eos).view(-1, 1) + encoder_out.repeat(len(hyps), 1, 1), encoder_mask.repeat(len(hyps), 1, 1), ys, ys_mask, cache) + + hyps_best_kept = [] + for j, hyp in enumerate(hyps): + top_k_logp, top_k_index = logp[j : j + 1].topk(beam_size) + + for b in range(beam_size): + new_hyp = {} + new_hyp["score"] = hyp["score"] + float(top_k_logp[0, b]) + new_hyp["yseq"] = [0] * (1 + len(hyp["yseq"])) + new_hyp["yseq"][: len(hyp["yseq"])] = hyp["yseq"] + new_hyp["yseq"][len(hyp["yseq"])] = int(top_k_index[0, b]) + new_hyp["cache"] = [cache_[j] for cache_ in cache] + # will be (2 x beam) hyps at most + hyps_best_kept.append(new_hyp) + + hyps_best_kept = sorted( + hyps_best_kept, key=lambda x: -x["score"])[:beam_size] + + # sort and get nbest + hyps = hyps_best_kept + if i == maxlen: + for hyp in hyps: + hyp["yseq"].append(self.eos) + + # finalize the ended hypotheses with word reward (by length) + remained_hyps = [] + for hyp in hyps: + if hyp["yseq"][-1] == self.eos: + hyp["score"] += (i - 1) * word_reward + cur_best_score = max(cur_best_score, hyp["score"]) + ended_hyps.append(hyp) + else: + # stop while guarantee the optimality + if hyp["score"] + maxlen * word_reward > cur_best_score: + remained_hyps.append(hyp) + + # stop predition when there is 
no unended hypothesis + if not remained_hyps: + break + hyps = remained_hyps # 3. Select best of best - scores = scores.view(batch_size, beam_size) - # TODO: length normalization - best_index = paddle.argmax(scores, axis=-1).long() # (B) - best_hyps_index = best_index + paddle.arange( - batch_size, dtype=paddle.long) * beam_size - best_hyps = paddle.index_select(hyps, index=best_hyps_index, axis=0) - best_hyps = best_hyps[:, 1:] - return best_hyps + best_hyp = max(ended_hyps, key=lambda x: x["score"]) + + return paddle.to_tensor([best_hyp["yseq"][1:]]) # @jit.to_static def subsampling_rate(self) -> int: @@ -472,6 +469,7 @@ class U2STBaseModel(nn.Layer): decoding_method: str, beam_size: int, word_reward: float=0.0, + maxlen_ratio: float=0.5, decoding_chunk_size: int=-1, num_decoding_left_chunks: int=-1, simulate_streaming: bool=False): @@ -507,6 +505,7 @@ class U2STBaseModel(nn.Layer): feats_lengths, beam_size=beam_size, word_reward=word_reward, + maxlen_ratio=maxlen_ratio, decoding_chunk_size=decoding_chunk_size, num_decoding_left_chunks=num_decoding_left_chunks, simulate_streaming=simulate_streaming) From f866059b744958b76e87fd342bc6dbbb000fec91 Mon Sep 17 00:00:00 2001 From: Junkun Date: Fri, 14 Jan 2022 14:58:23 -0800 Subject: [PATCH 52/60] config and formalize --- .../ted_en_zh/st0/conf/tuning/decode.yaml | 3 ++- .../ted_en_zh/st1/conf/tuning/decode.yaml | 3 ++- paddlespeech/s2t/models/u2_st/u2_st.py | 26 ++++++++++++------- 3 files changed, 20 insertions(+), 12 deletions(-) diff --git a/examples/ted_en_zh/st0/conf/tuning/decode.yaml b/examples/ted_en_zh/st0/conf/tuning/decode.yaml index ed081cf4a..7606ee35f 100644 --- a/examples/ted_en_zh/st0/conf/tuning/decode.yaml +++ b/examples/ted_en_zh/st0/conf/tuning/decode.yaml @@ -1,8 +1,9 @@ -batch_size: 5 +batch_size: 1 error_rate_type: char-bleu decoding_method: fullsentence # 'fullsentence', 'simultaneous' beam_size: 10 word_reward: 0.7 +maxlen_ratio: 0.3 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. # <0: for decoding, use full chunk. # >0: for decoding, use fixed chunk size as set. diff --git a/examples/ted_en_zh/st1/conf/tuning/decode.yaml b/examples/ted_en_zh/st1/conf/tuning/decode.yaml index d6104dbce..9f00dd764 100644 --- a/examples/ted_en_zh/st1/conf/tuning/decode.yaml +++ b/examples/ted_en_zh/st1/conf/tuning/decode.yaml @@ -1,9 +1,10 @@ -batch_size: 5 +batch_size: 1 error_rate_type: char-bleu decoding_method: fullsentence # 'fullsentence', 'simultaneous' beam_size: 10 word_reward: 0.7 +maxlen_ratio: 0.3 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. # <0: for decoding, use full chunk. # >0: for decoding, use fixed chunk size as set. diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index 211813f63..f92268eb7 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -308,28 +308,34 @@ class U2STBaseModel(nn.Layer): # 2. 
Decoder forward step by step for i in range(1, maxlen + 1): ys = paddle.ones((len(hyps), i), dtype=paddle.long) - + if hyps[0]["cache"] is not None: - cache = [paddle.ones((len(hyps), i-1, hyps[0]["cache"][0].shape[-1]), dtype=paddle.float32) for _ in range(len(hyps[0]["cache"]))] + cache = [ + paddle.ones( + (len(hyps), i - 1, hyp_cache.shape[-1]), + dtype=paddle.float32) + for hyp_cache in hyps[0]["cache"] + ] for j, hyp in enumerate(hyps): ys[j, :] = paddle.to_tensor(hyp["yseq"]) if hyps[0]["cache"] is not None: for k in range(len(cache)): cache[k][j] = hyps[j]["cache"][k] ys_mask = subsequent_mask(i).unsqueeze(0).to(device) - + logp, cache = self.st_decoder.forward_one_step( - encoder_out.repeat(len(hyps), 1, 1), encoder_mask.repeat(len(hyps), 1, 1), ys, ys_mask, cache) + encoder_out.repeat(len(hyps), 1, 1), + encoder_mask.repeat(len(hyps), 1, 1), ys, ys_mask, cache) hyps_best_kept = [] for j, hyp in enumerate(hyps): - top_k_logp, top_k_index = logp[j : j + 1].topk(beam_size) + top_k_logp, top_k_index = logp[j:j + 1].topk(beam_size) for b in range(beam_size): new_hyp = {} new_hyp["score"] = hyp["score"] + float(top_k_logp[0, b]) new_hyp["yseq"] = [0] * (1 + len(hyp["yseq"])) - new_hyp["yseq"][: len(hyp["yseq"])] = hyp["yseq"] + new_hyp["yseq"][:len(hyp["yseq"])] = hyp["yseq"] new_hyp["yseq"][len(hyp["yseq"])] = int(top_k_index[0, b]) new_hyp["cache"] = [cache_[j] for cache_ in cache] # will be (2 x beam) hyps at most @@ -337,13 +343,13 @@ class U2STBaseModel(nn.Layer): hyps_best_kept = sorted( hyps_best_kept, key=lambda x: -x["score"])[:beam_size] - + # sort and get nbest hyps = hyps_best_kept if i == maxlen: for hyp in hyps: hyp["yseq"].append(self.eos) - + # finalize the ended hypotheses with word reward (by length) remained_hyps = [] for hyp in hyps: @@ -355,7 +361,7 @@ class U2STBaseModel(nn.Layer): # stop while guarantee the optimality if hyp["score"] + maxlen * word_reward > cur_best_score: remained_hyps.append(hyp) - + # stop predition when there is no unended hypothesis if not remained_hyps: break @@ -364,7 +370,7 @@ class U2STBaseModel(nn.Layer): # 3. Select best of best best_hyp = max(ended_hyps, key=lambda x: x["score"]) - return paddle.to_tensor([best_hyp["yseq"][1:]]) + return paddle.to_tensor([best_hyp["yseq"][1:]]) # @jit.to_static def subsampling_rate(self) -> int: From 44408e5211b8c1457351c273d959591b518d8aeb Mon Sep 17 00:00:00 2001 From: Junkun Date: Fri, 14 Jan 2022 16:16:43 -0800 Subject: [PATCH 53/60] sync the variable name to others --- examples/ted_en_zh/st0/conf/tuning/decode.yaml | 2 +- examples/ted_en_zh/st1/conf/tuning/decode.yaml | 2 +- paddlespeech/s2t/exps/u2_st/model.py | 4 ++-- paddlespeech/s2t/models/u2_st/u2_st.py | 10 +++++----- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/ted_en_zh/st0/conf/tuning/decode.yaml b/examples/ted_en_zh/st0/conf/tuning/decode.yaml index 7606ee35f..7d8d1daf1 100644 --- a/examples/ted_en_zh/st0/conf/tuning/decode.yaml +++ b/examples/ted_en_zh/st0/conf/tuning/decode.yaml @@ -3,7 +3,7 @@ error_rate_type: char-bleu decoding_method: fullsentence # 'fullsentence', 'simultaneous' beam_size: 10 word_reward: 0.7 -maxlen_ratio: 0.3 +maxlenratio: 0.3 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. # <0: for decoding, use full chunk. # >0: for decoding, use fixed chunk size as set. 
diff --git a/examples/ted_en_zh/st1/conf/tuning/decode.yaml b/examples/ted_en_zh/st1/conf/tuning/decode.yaml index 9f00dd764..4f10acf74 100644 --- a/examples/ted_en_zh/st1/conf/tuning/decode.yaml +++ b/examples/ted_en_zh/st1/conf/tuning/decode.yaml @@ -4,7 +4,7 @@ error_rate_type: char-bleu decoding_method: fullsentence # 'fullsentence', 'simultaneous' beam_size: 10 word_reward: 0.7 -maxlen_ratio: 0.3 +maxlenratio: 0.3 decoding_chunk_size: -1 # decoding chunk size. Defaults to -1. # <0: for decoding, use full chunk. # >0: for decoding, use fixed chunk size as set. diff --git a/paddlespeech/s2t/exps/u2_st/model.py b/paddlespeech/s2t/exps/u2_st/model.py index b642e9337..6a32eda77 100644 --- a/paddlespeech/s2t/exps/u2_st/model.py +++ b/paddlespeech/s2t/exps/u2_st/model.py @@ -408,7 +408,7 @@ class U2STTester(U2STTrainer): decoding_method=decode_cfg.decoding_method, beam_size=decode_cfg.beam_size, word_reward=decode_cfg.word_reward, - maxlen_ratio=decode_cfg.maxlen_ratio, + maxlenratio=decode_cfg.maxlenratio, decoding_chunk_size=decode_cfg.decoding_chunk_size, num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, simulate_streaming=decode_cfg.simulate_streaming) @@ -436,7 +436,7 @@ class U2STTester(U2STTrainer): decoding_method=decode_cfg.decoding_method, beam_size=decode_cfg.beam_size, word_reward=decode_cfg.word_reward, - maxlen_ratio=decode_cfg.maxlen_ratio, + maxlenratio=decode_cfg.maxlenratio, decoding_chunk_size=decode_cfg.decoding_chunk_size, num_decoding_left_chunks=decode_cfg.num_decoding_left_chunks, simulate_streaming=decode_cfg.simulate_streaming) diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py index f92268eb7..bc76de7ad 100644 --- a/paddlespeech/s2t/models/u2_st/u2_st.py +++ b/paddlespeech/s2t/models/u2_st/u2_st.py @@ -264,7 +264,7 @@ class U2STBaseModel(nn.Layer): speech_lengths: paddle.Tensor, beam_size: int=10, word_reward: float=0.0, - maxlen_ratio: float=0.5, + maxlenratio: float=0.5, decoding_chunk_size: int=-1, num_decoding_left_chunks: int=-1, simulate_streaming: bool=False, ) -> paddle.Tensor: @@ -274,7 +274,7 @@ class U2STBaseModel(nn.Layer): speech_length (paddle.Tensor): (batch, ) beam_size (int): beam size for beam search word_reward (float): word reward used in beam search - maxlen_ratio (float): max length ratio to bound the length of translated text + maxlenratio (float): max length ratio to bound the length of translated text decoding_chunk_size (int): decoding chunk for dynamic chunk trained model. <0: for decoding, use full chunk. 
@@ -297,7 +297,7 @@ class U2STBaseModel(nn.Layer): num_decoding_left_chunks, simulate_streaming) # (B, maxlen, encoder_dim) - maxlen = max(int(encoder_out.shape[1] * maxlen_ratio), 5) + maxlen = max(int(encoder_out.shape[1] * maxlenratio), 5) hyp = {"score": 0.0, "yseq": [self.sos], "cache": None} hyps = [hyp] @@ -475,7 +475,7 @@ class U2STBaseModel(nn.Layer): decoding_method: str, beam_size: int, word_reward: float=0.0, - maxlen_ratio: float=0.5, + maxlenratio: float=0.5, decoding_chunk_size: int=-1, num_decoding_left_chunks: int=-1, simulate_streaming: bool=False): @@ -511,7 +511,7 @@ class U2STBaseModel(nn.Layer): feats_lengths, beam_size=beam_size, word_reward=word_reward, - maxlen_ratio=maxlen_ratio, + maxlenratio=maxlenratio, decoding_chunk_size=decoding_chunk_size, num_decoding_left_chunks=num_decoding_left_chunks, simulate_streaming=simulate_streaming) From 8028f33b7fa0e56bff401070de2d8d57d3f2d6af Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 17 Jan 2022 03:15:39 +0000 Subject: [PATCH 54/60] synchronize the version --- paddlespeech/__init__.py | 2 +- setup.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/paddlespeech/__init__.py b/paddlespeech/__init__.py index 8d32f2879..5dfc1974c 100644 --- a/paddlespeech/__init__.py +++ b/paddlespeech/__init__.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = '0.1.0' +__version__ = '0.1.1' diff --git a/setup.py b/setup.py index a6b18f979..8f68923d9 100644 --- a/setup.py +++ b/setup.py @@ -17,6 +17,7 @@ import io import os import subprocess as sp import sys +import paddlespeech from pathlib import Path from setuptools import Command @@ -172,7 +173,7 @@ class UploadCommand(Command): setup_info = dict( # Metadata name='paddlespeech', - version='0.1.1', + version=paddlespeech.__version__, author='PaddlePaddle Speech and Language Team', author_email='paddlesl@baidu.com', url='https://github.com/PaddlePaddle/PaddleSpeech', From 5aff0bde24e4aa1a23b548cd2761368c741180fe Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Mon, 17 Jan 2022 16:08:09 +0800 Subject: [PATCH 55/60] dir arch (#1347) --- .mergify.yml | 2 +- speechx/CMakeLists.txt | 77 +++++++++++++++++++ speechx/docker/.gitkeep | 0 speechx/examples/.gitkeep | 0 speechx/speechx/CMakeLists.txt | 0 speechx/speechx/decoder/CMakeLists.txt | 2 + speechx/speechx/frontend/CMakeLists.txt | 0 speechx/speechx/frontend/audio/CMakeLists.txt | 0 speechx/speechx/frontend/text/CMakeLists.txt | 0 speechx/speechx/kaldi/.gitkeep | 0 speechx/speechx/model/CMakeLists.txt | 0 speechx/speechx/protocol/CMakeLists.txt | 0 speechx/speechx/third_party/CMakeLists.txt | 0 speechx/speechx/utils/CMakeLists.txt | 0 14 files changed, 80 insertions(+), 1 deletion(-) create mode 100644 speechx/CMakeLists.txt create mode 100644 speechx/docker/.gitkeep create mode 100644 speechx/examples/.gitkeep create mode 100644 speechx/speechx/CMakeLists.txt create mode 100644 speechx/speechx/decoder/CMakeLists.txt create mode 100644 speechx/speechx/frontend/CMakeLists.txt create mode 100644 speechx/speechx/frontend/audio/CMakeLists.txt create mode 100644 speechx/speechx/frontend/text/CMakeLists.txt create mode 100644 speechx/speechx/kaldi/.gitkeep create mode 100644 speechx/speechx/model/CMakeLists.txt create mode 100644 speechx/speechx/protocol/CMakeLists.txt create mode 100644 speechx/speechx/third_party/CMakeLists.txt create mode 100644 speechx/speechx/utils/CMakeLists.txt diff --git a/.mergify.yml b/.mergify.yml index 
3347c6dc3..f012c2f8e 100644 --- a/.mergify.yml +++ b/.mergify.yml @@ -130,7 +130,7 @@ pull_request_rules: add: ["Docker"] - name: "auto add label=Deployment" conditions: - - files~=^speechnn/ + - files~=^speechx/ actions: label: add: ["Deployment"] diff --git a/speechx/CMakeLists.txt b/speechx/CMakeLists.txt new file mode 100644 index 000000000..878374bab --- /dev/null +++ b/speechx/CMakeLists.txt @@ -0,0 +1,77 @@ +cmake_minimum_required(VERSION 3.14 FATAL_ERROR) + +project(deepspeech VERSION 0.1) + +set(CMAKE_VERBOSE_MAKEFILE on) +# use the C++14 standard +set(CMAKE_CXX_STANDARD 14) + +# include file +include(FetchContent) +include(ExternalProject) +# fc_patch dir +set(FETCHCONTENT_QUIET off) +get_filename_component(fc_patch "fc_patch" REALPATH BASE_DIR "${CMAKE_SOURCE_DIR}") +set(FETCHCONTENT_BASE_DIR ${fc_patch}) + + +############################################################################### +# Option Configurations +############################################################################### +# option configurations +option(TEST_DEBUG "option for debug" OFF) + + +############################################################################### +# Include third party +############################################################################### +# # example of including a third-party dependency +# FetchContent_Declare() +# # FetchContent_MakeAvailable was not added until CMake 3.14 +# FetchContent_MakeAvailable() +# include_directories() + +# ABSEIL-CPP +include(FetchContent) +FetchContent_Declare( + absl + GIT_REPOSITORY "https://github.com/abseil/abseil-cpp.git" + GIT_TAG "20210324.1" +) +FetchContent_MakeAvailable(absl) + +# libsndfile +include(FetchContent) +FetchContent_Declare( + libsndfile + GIT_REPOSITORY "https://github.com/libsndfile/libsndfile.git" + GIT_TAG "1.0.31" +) +FetchContent_MakeAvailable(libsndfile) + + +############################################################################### +# Add local library +############################################################################### +# system lib +find_package() +# if the dir has a CMakeLists.txt +add_subdirectory() +# if the dir does not have a CMakeLists.txt +add_library(lib_name STATIC file.cc) +target_link_libraries(lib_name item0 item1) +add_dependencies(lib_name depend-target) + + +############################################################################### +# Library installation +############################################################################### +install() + + +############################################################################### +# Build binary file +############################################################################### +add_executable() +target_link_libraries() + diff --git a/speechx/docker/.gitkeep b/speechx/docker/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/speechx/examples/.gitkeep b/speechx/examples/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/speechx/speechx/CMakeLists.txt b/speechx/speechx/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechx/speechx/decoder/CMakeLists.txt b/speechx/speechx/decoder/CMakeLists.txt new file mode 100644 index 000000000..259261bdf --- /dev/null +++ b/speechx/speechx/decoder/CMakeLists.txt @@ -0,0 +1,2 @@ +aux_source_directory(. 
DIR_LIB_SRCS) +add_library(decoder STATIC ${DIR_LIB_SRCS}) diff --git a/speechx/speechx/frontend/CMakeLists.txt b/speechx/speechx/frontend/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechx/speechx/frontend/text/CMakeLists.txt b/speechx/speechx/frontend/text/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechx/speechx/kaldi/.gitkeep b/speechx/speechx/kaldi/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/speechx/speechx/model/CMakeLists.txt b/speechx/speechx/model/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechx/speechx/protocol/CMakeLists.txt b/speechx/speechx/protocol/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechx/speechx/third_party/CMakeLists.txt b/speechx/speechx/third_party/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb diff --git a/speechx/speechx/utils/CMakeLists.txt b/speechx/speechx/utils/CMakeLists.txt new file mode 100644 index 000000000..e69de29bb From d368d57d67ec8239c42a25a95e56d534cf23b005 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Mon, 17 Jan 2022 16:11:27 +0800 Subject: [PATCH 56/60] fix low ips bug of speedyspeech and fastspeech2, test=tts (#1349) --- .../t2s/models/fastspeech2/fastspeech2.py | 4 +- .../t2s/models/speedyspeech/speedyspeech.py | 62 ++++++++----------- .../t2s/modules/predictor/length_regulator.py | 35 +++++++++-- 3 files changed, 60 insertions(+), 41 deletions(-) diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 405ad957d..6bb651a01 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -627,7 +627,7 @@ class FastSpeech2(nn.Layer): hs = hs + e_embs + p_embs # (B, Lmax, adim) - hs = self.length_regulator(hs, d_outs, alpha) + hs = self.length_regulator(hs, d_outs, alpha, is_inference=True) else: d_outs = self.duration_predictor(hs, d_masks) # use groundtruth in training @@ -638,7 +638,7 @@ class FastSpeech2(nn.Layer): hs = hs + e_embs + p_embs # (B, Lmax, adim) - hs = self.length_regulator(hs, ds) + hs = self.length_regulator(hs, ds, is_inference=False) # forward decoder if olens is not None and not is_inference: diff --git a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py index cc9e20662..42e8f7432 100644 --- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py +++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py @@ -14,28 +14,9 @@ import paddle from paddle import nn +from paddlespeech.t2s.modules.nets_utils import initialize from paddlespeech.t2s.modules.positional_encoding import sinusoid_position_encoding - - -def expand(encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor: - """ - encodings: (B, T, C) - durations: (B, T) - """ - batch_size, t_enc = paddle.shape(durations) - slens = paddle.sum(durations, -1) - t_dec = paddle.max(slens) - M = paddle.zeros([batch_size, t_dec, t_enc]) - for i in range(batch_size): - k = 0 - for j in range(t_enc): - d = durations[i, j] - # If the d == 0, slice action is meaningless and not supported - if d >= 1: - M[0, k:k + d, j] = 1 - k += d - encodings = paddle.matmul(M, encodings) - return encodings +from paddlespeech.t2s.modules.predictor.length_regulator import 
LengthRegulator class ResidualBlock(nn.Layer): @@ -175,19 +156,25 @@ class SpeedySpeechDecoder(nn.Layer): class SpeedySpeech(nn.Layer): - def __init__(self, - vocab_size, - encoder_hidden_size, - encoder_kernel_size, - encoder_dilations, - duration_predictor_hidden_size, - decoder_hidden_size, - decoder_output_size, - decoder_kernel_size, - decoder_dilations, - tone_size=None, - spk_num=None): + def __init__( + self, + vocab_size, + encoder_hidden_size, + encoder_kernel_size, + encoder_dilations, + duration_predictor_hidden_size, + decoder_hidden_size, + decoder_output_size, + decoder_kernel_size, + decoder_dilations, + tone_size=None, + spk_num=None, + init_type: str="xavier_uniform", ): super().__init__() + + # initialize parameters + initialize(self, init_type) + encoder = SpeedySpeechEncoder(vocab_size, tone_size, encoder_hidden_size, encoder_kernel_size, encoder_dilations, spk_num) @@ -198,6 +185,10 @@ class SpeedySpeech(nn.Layer): self.encoder = encoder self.duration_predictor = duration_predictor self.decoder = decoder + # define length regulator + self.length_regulator = LengthRegulator() + + nn.initializer.set_global_initializer(None) def forward(self, text, tones, durations, spk_id: paddle.Tensor=None): # input of embedding must be int64 @@ -212,7 +203,7 @@ class SpeedySpeech(nn.Layer): # expand encodings durations_to_expand = durations - encodings = expand(encodings, durations_to_expand) + encodings = self.length_regulator(encodings, durations_to_expand) # decode # remove positional encoding here @@ -240,7 +231,8 @@ class SpeedySpeech(nn.Layer): durations_to_expand = durations_to_expand.astype(paddle.int64) else: durations_to_expand = durations - encodings = expand(encodings, durations_to_expand) + encodings = self.length_regulator( + encodings, durations_to_expand, is_inference=True) shape = paddle.shape(encodings) t_dec, feature_size = shape[1], shape[2] diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py index f1ecfb7c1..9510dd88b 100644 --- a/paddlespeech/t2s/modules/predictor/length_regulator.py +++ b/paddlespeech/t2s/modules/predictor/length_regulator.py @@ -13,6 +13,7 @@ # limitations under the License. 
# Modified from espnet(https://github.com/espnet/espnet) """Length regulator related modules.""" +import numpy as np import paddle from paddle import nn @@ -43,6 +44,28 @@ class LengthRegulator(nn.Layer): super().__init__() self.pad_value = pad_value + # expand_numpy is faster than expand + def expand_numpy(self, encodings: paddle.Tensor, + durations: paddle.Tensor) -> paddle.Tensor: + """ + encodings: (B, T, C) + durations: (B, T) + """ + batch_size, t_enc = durations.shape + durations = durations.numpy() + slens = np.sum(durations, -1) + t_dec = np.max(slens) + M = np.zeros([batch_size, t_dec, t_enc]) + for i in range(batch_size): + k = 0 + for j in range(t_enc): + d = durations[i, j] + M[i, k:k + d, j] = 1 + k += d + M = paddle.to_tensor(M, dtype=encodings.dtype) + encodings = paddle.matmul(M, encodings) + return encodings + def expand(self, encodings: paddle.Tensor, durations: paddle.Tensor) -> paddle.Tensor: """ @@ -50,20 +73,21 @@ class LengthRegulator(nn.Layer): durations: (B, T) """ batch_size, t_enc = paddle.shape(durations) - slens = durations.sum(-1) - t_dec = slens.max() + slens = paddle.sum(durations, -1) + t_dec = paddle.max(slens) M = paddle.zeros([batch_size, t_dec, t_enc]) for i in range(batch_size): k = 0 for j in range(t_enc): d = durations[i, j] + # If the d == 0, slice action is meaningless and not supported in paddle if d >= 1: M[i, k:k + d, j] = 1 k += d encodings = paddle.matmul(M, encodings) return encodings - def forward(self, xs, ds, alpha=1.0): + def forward(self, xs, ds, alpha=1.0, is_inference=False): """Calculate forward propagation. Parameters @@ -85,4 +109,7 @@ class LengthRegulator(nn.Layer): assert alpha > 0 ds = paddle.round(ds.cast(dtype=paddle.float32) * alpha) ds = ds.cast(dtype=paddle.int64) - return self.expand(xs, ds) + if is_inference: + return self.expand(xs, ds) + else: + return self.expand_numpy(xs, ds) From 38edfd1a89a419ebd6614cb2017b940e8cc32135 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 17 Jan 2022 08:21:34 +0000 Subject: [PATCH 57/60] Add Deepspeech2 online and offline in cli --- paddlespeech/cli/asr/infer.py | 82 ++++++++++++++++++++++++++++------- 1 file changed, 67 insertions(+), 15 deletions(-) diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index aa4e31d9e..447b0a1a0 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import argparse -import io import os import sys from typing import List @@ -23,9 +22,9 @@ import librosa import numpy as np import paddle import soundfile -import yaml from yacs.config import CfgNode +from ..download import get_path_from_url from ..executor import BaseExecutor from ..log import logger from ..utils import cli_register @@ -64,14 +63,47 @@ pretrained_models = { 'ckpt_path': 'exp/transformer/checkpoints/avg_10', }, + "deepspeech2offline_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz', + 'md5': + '932c3593d62fe5c741b59b31318aa314', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2/checkpoints/avg_1', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, + "deepspeech2online_aishell-zh-16k": { + 'url': + 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar.gz', + 'md5': + 'd5e076217cf60486519f72c217d21b9b', + 'cfg_path': + 'model.yaml', + 'ckpt_path': + 'exp/deepspeech2_online/checkpoints/avg_1', + 'lm_url': + 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm', + 'lm_md5': + '29e02312deb2e59b3c8686c7966d4fe3' + }, } model_alias = { - "deepspeech2offline": "paddlespeech.s2t.models.ds2:DeepSpeech2Model", - "deepspeech2online": "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline", - "conformer": "paddlespeech.s2t.models.u2:U2Model", - "transformer": "paddlespeech.s2t.models.u2:U2Model", - "wenetspeech": "paddlespeech.s2t.models.u2:U2Model", + "deepspeech2offline": + "paddlespeech.s2t.models.ds2:DeepSpeech2Model", + "deepspeech2online": + "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline", + "conformer": + "paddlespeech.s2t.models.u2:U2Model", + "transformer": + "paddlespeech.s2t.models.u2:U2Model", + "wenetspeech": + "paddlespeech.s2t.models.u2:U2Model", } @@ -95,7 +127,8 @@ class ASRExecutor(BaseExecutor): '--lang', type=str, default='zh', - help='Choose model language. zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]') + help='Choose model language. 
zh or en, zh:[conformer_wenetspeech-zh-16k], en:[transformer_librispeech-en-16k]' + ) self.parser.add_argument( "--sample_rate", type=int, @@ -111,7 +144,10 @@ class ASRExecutor(BaseExecutor): '--decode_method', type=str, default='attention_rescoring', - choices=['ctc_greedy_search', 'ctc_prefix_beam_search', 'attention', 'attention_rescoring'], + choices=[ + 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention', + 'attention_rescoring' + ], help='only support transformer and conformer model') self.parser.add_argument( '--ckpt_path', @@ -187,13 +223,21 @@ class ASRExecutor(BaseExecutor): if "deepspeech2online" in model_type or "deepspeech2offline" in model_type: from paddlespeech.s2t.io.collator import SpeechCollator self.vocab = self.config.vocab_filepath - self.config.decode.lang_model_path = os.path.join(res_path, self.config.decode.lang_model_path) + self.config.decode.lang_model_path = os.path.join( + MODEL_HOME, 'language_model', + self.config.decode.lang_model_path) self.collate_fn_test = SpeechCollator.from_config(self.config) self.text_feature = TextFeaturizer( - unit_type=self.config.unit_type, - vocab=self.vocab) + unit_type=self.config.unit_type, vocab=self.vocab) + lm_url = pretrained_models[tag]['lm_url'] + lm_md5 = pretrained_models[tag]['lm_md5'] + self.download_lm( + lm_url, + os.path.dirname(self.config.decode.lang_model_path), lm_md5) + elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type: - self.config.spm_model_prefix = os.path.join(self.res_path, self.config.spm_model_prefix) + self.config.spm_model_prefix = os.path.join( + self.res_path, self.config.spm_model_prefix) self.text_feature = TextFeaturizer( unit_type=self.config.unit_type, vocab=self.config.vocab_filepath, @@ -319,6 +363,13 @@ class ASRExecutor(BaseExecutor): """ return self._outputs["result"] + def download_lm(self, url, lm_dir, md5sum): + download_path = get_path_from_url( + url=url, + root_dir=lm_dir, + md5sum=md5sum, + decompress=False, ) + def _pcm16to32(self, audio): assert (audio.dtype == np.int16) audio = audio.astype("float32") @@ -411,7 +462,7 @@ class ASRExecutor(BaseExecutor): try: res = self(audio_file, model, lang, sample_rate, config, ckpt_path, - decode_method, force_yes, device) + decode_method, force_yes, device) logger.info('ASR Result: {}'.format(res)) return True except Exception as e: @@ -435,7 +486,8 @@ class ASRExecutor(BaseExecutor): audio_file = os.path.abspath(audio_file) self._check(audio_file, sample_rate, force_yes) paddle.set_device(device) - self._init_from_path(model, lang, sample_rate, config, decode_method, ckpt_path) + self._init_from_path(model, lang, sample_rate, config, decode_method, + ckpt_path) self.preprocess(model, audio_file) self.infer(model) res = self.postprocess() # Retrieve result of asr. From 15a00431ad2fcdce3938b372b9711ce9b3bad324 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Mon, 17 Jan 2022 12:07:53 +0000 Subject: [PATCH 58/60] rm s2t:augmentation.md, test=doc --- docs/source/asr/augmentation.md | 40 --------------------------------- docs/source/index.rst | 1 - 2 files changed, 41 deletions(-) delete mode 100644 docs/source/asr/augmentation.md diff --git a/docs/source/asr/augmentation.md b/docs/source/asr/augmentation.md deleted file mode 100644 index 8e65cb19e..000000000 --- a/docs/source/asr/augmentation.md +++ /dev/null @@ -1,40 +0,0 @@ -# Data Augmentation Pipeline - -Data augmentation has often been a highly effective technique to boost deep learning performance. 
We augment our speech data by synthesizing new audio with small random perturbations (label-invariant transformations) added to the raw audio. You don't have to do the synthesis yourself, as it is already embedded into the data provider and is done on the fly, randomly for each epoch during training. - -Six optional augmentation components are provided to be selected, configured, and inserted into the processing pipeline. - -* Audio - - Volume Perturbation - - Speed Perturbation - - Shifting Perturbation - - Online Bayesian normalization - - Noise Perturbation (need background noise audio files) - - Impulse Response (need impulse audio files) - -* Feature - - SpecAugment - - Adaptive SpecAugment - -To inform the trainer of what augmentation components are needed and what their processing orders are, it is required to prepare in advance an *augmentation configuration file* in [JSON](http://www.json.org/) format. For example: - -``` -[{ - "type": "speed", - "params": {"min_speed_rate": 0.95, - "max_speed_rate": 1.05}, - "prob": 0.6 -}, -{ - "type": "shift", - "params": {"min_shift_ms": -5, - "max_shift_ms": 5}, - "prob": 0.8 -}] -``` - -When the `augment_conf_file` argument is set to the path of the above example configuration file, every audio clip in every epoch will be processed: with a 60% chance, it will first be speed-perturbed with a uniformly sampled speed rate between 0.95 and 1.05, and then, with an 80% chance, it will be shifted in time with a randomly sampled offset between -5 ms and 5 ms. Finally, this newly synthesized audio clip will be fed into the feature extractor for further training. - -For other configuration examples, please refer to `examples/conf/augmentation.example.json`. - -Be careful when using data augmentation: improper augmentation can harm training by enlarging the train-test gap.
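The pipeline the removed page describes reduces to a small amount of code: parse the JSON list, then apply each component independently with its configured probability. A rough sketch under those assumptions — the two transforms below are illustrative stand-ins for the project's actual augmentor classes, with speed perturbation approximated by naive linear resampling:

```
import json
import random

import numpy as np

def speed_perturb(samples, min_speed_rate, max_speed_rate, rng):
    # Naive speed perturbation via linear resampling of the waveform.
    rate = rng.uniform(min_speed_rate, max_speed_rate)
    old_idx = np.arange(len(samples))
    new_idx = np.linspace(0, len(samples) - 1, int(len(samples) / rate))
    return np.interp(new_idx, old_idx, samples)

def shift_perturb(samples, min_shift_ms, max_shift_ms, rng, sample_rate=16000):
    # Shift the audio in time, zero-padding the vacated region.
    shift = int(rng.uniform(min_shift_ms, max_shift_ms) * sample_rate / 1000)
    out = np.zeros_like(samples)
    if shift > 0:
        out[shift:] = samples[:len(samples) - shift]
    elif shift < 0:
        out[:shift] = samples[-shift:]
    else:
        out = samples.copy()
    return out

AUGMENTORS = {"speed": speed_perturb, "shift": shift_perturb}

def apply_pipeline(samples, config_json, rng):
    # Each component fires independently with its configured probability.
    for entry in json.loads(config_json):
        if rng.random() < entry["prob"]:
            samples = AUGMENTORS[entry["type"]](samples, rng=rng, **entry["params"])
    return samples

conf = '[{"type": "speed", "params": {"min_speed_rate": 0.95, "max_speed_rate": 1.05}, "prob": 0.6}]'
audio = np.random.uniform(-1, 1, 16000).astype("float32")
augmented = apply_pipeline(audio, conf, random.Random(0))
```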
diff --git a/docs/source/index.rst b/docs/source/index.rst index 5bbc93196..bf675b4bd 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -27,7 +27,6 @@ Contents asr/models_introduction asr/data_preparation - asr/augmentation asr/feature_list asr/ngram_lm From 1a9e59612a9124ffc3f97aae57f4d24792cdb9cf Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 18 Jan 2022 03:53:27 +0000 Subject: [PATCH 59/60] fix fastspeech2 multi speaker to static, test=tts --- examples/aishell3/tts3/README.md | 3 +- .../aishell3/tts3/local/synthesize_e2e.sh | 3 +- examples/vctk/tts3/README.md | 9 +-- examples/vctk/tts3/local/synthesize_e2e.sh | 3 +- paddlespeech/t2s/exps/inference.py | 66 ++++++++++++++++--- paddlespeech/t2s/exps/synthesize_e2e.py | 13 +++- .../t2s/models/fastspeech2/fastspeech2.py | 2 +- 7 files changed, 78 insertions(+), 21 deletions(-) diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index 2538e8f96..281ad836b 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -257,6 +257,7 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \ --output_dir=exp/default/test_e2e \ --phones_dict=fastspeech2_nosil_aishell3_ckpt_0.4/phone_id_map.txt \ --speaker_dict=fastspeech2_nosil_aishell3_ckpt_0.4/speaker_id_map.txt \ - --spk_id=0 + --spk_id=0 \ + --inference_dir=exp/default/inference ``` diff --git a/examples/aishell3/tts3/local/synthesize_e2e.sh b/examples/aishell3/tts3/local/synthesize_e2e.sh index d0d925859..60e1a5cee 100755 --- a/examples/aishell3/tts3/local/synthesize_e2e.sh +++ b/examples/aishell3/tts3/local/synthesize_e2e.sh @@ -20,4 +20,5 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \ --output_dir=${train_output_path}/test_e2e \ --phones_dict=dump/phone_id_map.txt \ --speaker_dict=dump/speaker_id_map.txt \ - --spk_id=0 + --spk_id=0 \ + --inference_dir=${train_output_path}/inference diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index 74c1086a0..157949d1f 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -240,13 +240,14 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \ --am_ckpt=fastspeech2_nosil_vctk_ckpt_0.5/snapshot_iter_66200.pdz \ --am_stat=fastspeech2_nosil_vctk_ckpt_0.5/speech_stats.npy \ --voc=pwgan_vctk \ - --voc_config=pwg_vctk_ckpt_0.5/pwg_default.yaml \ - --voc_ckpt=pwg_vctk_ckpt_0.5/pwg_snapshot_iter_1000000.pdz \ - --voc_stat=pwg_vctk_ckpt_0.5/pwg_stats.npy \ + --voc_config=pwg_vctk_ckpt_0.1.1/default.yaml \ + --voc_ckpt=pwg_vctk_ckpt_0.1.1/snapshot_iter_1500000.pdz \ + --voc_stat=pwg_vctk_ckpt_0.1.1/feats_stats.npy \ --lang=en \ --text=${BIN_DIR}/../sentences_en.txt \ --output_dir=exp/default/test_e2e \ --phones_dict=fastspeech2_nosil_vctk_ckpt_0.5/phone_id_map.txt \ --speaker_dict=fastspeech2_nosil_vctk_ckpt_0.5/speaker_id_map.txt \ - --spk_id=0 + --spk_id=0 \ + --inference_dir=exp/default/inference ``` diff --git a/examples/vctk/tts3/local/synthesize_e2e.sh b/examples/vctk/tts3/local/synthesize_e2e.sh index 51bb9e192..60d56d1c9 100755 --- a/examples/vctk/tts3/local/synthesize_e2e.sh +++ b/examples/vctk/tts3/local/synthesize_e2e.sh @@ -20,4 +20,5 @@ python3 ${BIN_DIR}/../synthesize_e2e.py \ --output_dir=${train_output_path}/test_e2e \ --phones_dict=dump/phone_id_map.txt \ --speaker_dict=dump/speaker_id_map.txt \ - --spk_id=0 + --spk_id=0 \ + --inference_dir=${train_output_path}/inference diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index e1d5306c2..2c9b51f9f 100644 --- a/paddlespeech/t2s/exps/inference.py +++ 
b/paddlespeech/t2s/exps/inference.py @@ -14,9 +14,11 @@ import argparse from pathlib import Path +import numpy import soundfile as sf from paddle import inference +from paddlespeech.t2s.frontend import English from paddlespeech.t2s.frontend.zh_frontend import Frontend @@ -29,20 +31,38 @@ def main(): '--am', type=str, default='fastspeech2_csmsc', - choices=['speedyspeech_csmsc', 'fastspeech2_csmsc'], + choices=[ + 'speedyspeech_csmsc', 'fastspeech2_csmsc', 'fastspeech2_aishell3', + 'fastspeech2_vctk' + ], help='Choose acoustic model type of tts task.') parser.add_argument( "--phones_dict", type=str, default=None, help="phone vocabulary file.") parser.add_argument( "--tones_dict", type=str, default=None, help="tone vocabulary file.") + parser.add_argument( + "--speaker_dict", type=str, default=None, help="speaker id map file.") + parser.add_argument( + '--spk_id', + type=int, + default=0, + help='spk id for multi speaker acoustic model') # voc parser.add_argument( '--voc', type=str, default='pwgan_csmsc', - choices=['pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc'], + choices=[ + 'pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc', 'pwgan_aishell3', + 'pwgan_vctk' + ], help='Choose vocoder type of tts task.') # other + parser.add_argument( + '--lang', + type=str, + default='zh', + help='Choose model language. zh or en') parser.add_argument( "--text", type=str, @@ -53,8 +73,12 @@ def main(): args, _ = parser.parse_known_args() - frontend = Frontend( - phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) + # frontend + if args.lang == 'zh': + frontend = Frontend( + phone_vocab_path=args.phones_dict, tone_vocab_path=args.tones_dict) + elif args.lang == 'en': + frontend = English(phone_vocab_path=args.phones_dict) print("frontend done!") # model: {model_name}_{dataset} @@ -83,30 +107,52 @@ def main(): print("in new inference") + # construct dataset for evaluation + sentences = [] with open(args.text, 'rt') as f: for line in f: items = line.strip().split() utt_id = items[0] - sentence = "".join(items[1:]) + if args.lang == 'zh': + sentence = "".join(items[1:]) + elif args.lang == 'en': + sentence = " ".join(items[1:]) sentences.append((utt_id, sentence)) get_tone_ids = False if am_name == 'speedyspeech': get_tone_ids = True + if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: + get_spk_id = True + spk_id = numpy.array([args.spk_id]) am_input_names = am_predictor.get_input_names() - + print("am_input_names:", am_input_names) + merge_sentences = True for utt_id, sentence in sentences: - input_ids = frontend.get_input_ids( - sentence, merge_sentences=True, get_tone_ids=get_tone_ids) - phone_ids = input_ids["phone_ids"] + if args.lang == 'zh': + input_ids = frontend.get_input_ids( + sentence, + merge_sentences=merge_sentences, + get_tone_ids=get_tone_ids) + phone_ids = input_ids["phone_ids"] + elif args.lang == 'en': + input_ids = frontend.get_input_ids( + sentence, merge_sentences=merge_sentences) + phone_ids = input_ids["phone_ids"] + else: + print("lang should in {'zh', 'en'}!") + if get_tone_ids: tone_ids = input_ids["tone_ids"] tones = tone_ids[0].numpy() tones_handle = am_predictor.get_input_handle(am_input_names[1]) tones_handle.reshape(tones.shape) tones_handle.copy_from_cpu(tones) - + if get_spk_id: + spk_id_handle = am_predictor.get_input_handle(am_input_names[1]) + spk_id_handle.reshape(spk_id.shape) + spk_id_handle.copy_from_cpu(spk_id) phones = phone_ids[0].numpy() phones_handle = am_predictor.get_input_handle(am_input_names[0]) phones_handle.reshape(phones.shape) 
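The handle plumbing added above follows the standard Paddle Inference pattern: look up the input names, reshape each named handle to the incoming array, copy it in, run, then copy the first output back out. Condensed into a standalone sketch — the exported filenames and the (phones, spk_id) input order are assumptions that mirror this patch, not guarantees of the API:

```
import numpy as np
from paddle.inference import Config, create_predictor

# Exported files produced by synthesize_e2e.py --inference_dir (names assumed).
config = Config("inference/fastspeech2_aishell3.pdmodel",
                "inference/fastspeech2_aishell3.pdiparams")
predictor = create_predictor(config)

phones = np.random.randint(1, 100, size=(50, ), dtype="int64")  # phone ids
spk_id = np.array([0], dtype="int64")                           # speaker id

# Feed every input through its named handle.
for name, value in zip(predictor.get_input_names(), [phones, spk_id]):
    handle = predictor.get_input_handle(name)
    handle.reshape(value.shape)
    handle.copy_from_cpu(value)

predictor.run()
output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
mel = output_handle.copy_to_cpu()  # (T, odim) features for the vocoder
```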
diff --git a/paddlespeech/t2s/exps/synthesize_e2e.py b/paddlespeech/t2s/exps/synthesize_e2e.py index 15ed1e4d4..9b503213a 100644 --- a/paddlespeech/t2s/exps/synthesize_e2e.py +++ b/paddlespeech/t2s/exps/synthesize_e2e.py @@ -159,9 +159,16 @@ def evaluate(args): # acoustic model if am_name == 'fastspeech2': if am_dataset in {"aishell3", "vctk"} and args.speaker_dict: - print( - "Haven't test dygraph to static for multi speaker fastspeech2 now!" - ) + am_inference = jit.to_static( + am_inference, + input_spec=[ + InputSpec([-1], dtype=paddle.int64), + InputSpec([1], dtype=paddle.int64) + ]) + paddle.jit.save(am_inference, + os.path.join(args.inference_dir, args.am)) + am_inference = paddle.jit.load( + os.path.join(args.inference_dir, args.am)) else: am_inference = jit.to_static( am_inference, diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py index 6bb651a01..dc136ffda 100644 --- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py +++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py @@ -781,7 +781,7 @@ class FastSpeech2(nn.Layer): elif self.spk_embed_integration_type == "concat": # concat hidden states with spk embeds and then apply projection spk_emb = F.normalize(spk_emb).unsqueeze(1).expand( - shape=[-1, hs.shape[1], -1]) + shape=[-1, paddle.shape(hs)[1], -1]) hs = self.spk_projection(paddle.concat([hs, spk_emb], axis=-1)) else: raise NotImplementedError("support only add or concat.") From 41d24337cb52555b28bd3a72ab1334ea67dac352 Mon Sep 17 00:00:00 2001 From: TianYuan Date: Tue, 18 Jan 2022 04:02:35 +0000 Subject: [PATCH 60/60] fix fastspeech2 multi speaker to static, test=tts --- examples/aishell3/tts3/local/inference.sh | 19 +++++++++++++++++++ examples/vctk/tts3/local/inference.sh | 20 ++++++++++++++++++++ paddlespeech/t2s/exps/inference.py | 1 + 3 files changed, 40 insertions(+) create mode 100755 examples/aishell3/tts3/local/inference.sh create mode 100755 examples/vctk/tts3/local/inference.sh diff --git a/examples/aishell3/tts3/local/inference.sh b/examples/aishell3/tts3/local/inference.sh new file mode 100755 index 000000000..3b03b53ce --- /dev/null +++ b/examples/aishell3/tts3/local/inference.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_aishell3 \ + --voc=pwgan_aishell3 \ + --text=${BIN_DIR}/../sentences.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --spk_id=0 +fi + diff --git a/examples/vctk/tts3/local/inference.sh b/examples/vctk/tts3/local/inference.sh new file mode 100755 index 000000000..caef89d8b --- /dev/null +++ b/examples/vctk/tts3/local/inference.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +train_output_path=$1 + +stage=0 +stop_stage=0 + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + python3 ${BIN_DIR}/../inference.py \ + --inference_dir=${train_output_path}/inference \ + --am=fastspeech2_vctk \ + --voc=pwgan_vctk \ + --text=${BIN_DIR}/../sentences_en.txt \ + --output_dir=${train_output_path}/pd_infer_out \ + --phones_dict=dump/phone_id_map.txt \ + --speaker_dict=dump/speaker_id_map.txt \ + --spk_id=0 \ + --lang=en +fi + diff --git a/paddlespeech/t2s/exps/inference.py b/paddlespeech/t2s/exps/inference.py index 2c9b51f9f..37afd0abc 100644 --- a/paddlespeech/t2s/exps/inference.py +++ 
b/paddlespeech/t2s/exps/inference.py @@ -120,6 +120,7 @@ def main(): sentences.append((utt_id, sentence)) get_tone_ids = False + get_spk_id = False if am_name == 'speedyspeech': get_tone_ids = True if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
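The dygraph-to-static export that PATCH 59 enables for multi-speaker models comes down to two things: one `InputSpec` per `forward` argument, and every data-dependent dimension routed through `paddle.shape` — which is exactly why the `spk_emb` expansion in fastspeech2.py was changed. A toy stand-in for the acoustic model (a hypothetical module, using only public Paddle APIs) shows the full export/reload round trip:

```
import paddle
from paddle.static import InputSpec

class ToyAM(paddle.nn.Layer):
    # Stand-in for FastSpeech2Inference: (phone ids, speaker id) -> features.
    def __init__(self):
        super().__init__()
        self.emb = paddle.nn.Embedding(100, 8)
        self.spk_emb = paddle.nn.Embedding(10, 8)

    def forward(self, text, spk_id):
        hs = self.emb(text).unsqueeze(0)         # (1, T, 8)
        spk = self.spk_emb(spk_id).unsqueeze(1)  # (1, 1, 8)
        # The data-dependent length T must come from paddle.shape, not
        # hs.shape, or the traced graph bakes in a fixed length.
        spk = spk.expand(shape=[-1, paddle.shape(hs)[1], -1])
        return hs + spk

am = paddle.jit.to_static(
    ToyAM(),
    input_spec=[
        InputSpec([-1], dtype=paddle.int64),  # variable-length phone ids
        InputSpec([1], dtype=paddle.int64),   # speaker id
    ])
paddle.jit.save(am, "exp/default/inference/toy_am")

# Reloading immediately checks that the exported graph actually runs.
loaded = paddle.jit.load("exp/default/inference/toy_am")
out = loaded(
    paddle.to_tensor([3, 7, 9], dtype="int64"),
    paddle.to_tensor([0], dtype="int64"))
```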