From 1cdd41bd03488b38c6082c766bf819b6bc94f61c Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 24 May 2022 09:46:12 +0000 Subject: [PATCH 01/14] fix pad_sequence, test=asr --- paddlespeech/s2t/utils/tensor_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py index 0dbaa0b6b..e105253c2 100644 --- a/paddlespeech/s2t/utils/tensor_utils.py +++ b/paddlespeech/s2t/utils/tensor_utils.py @@ -82,7 +82,7 @@ def pad_sequence(sequences: List[paddle.Tensor], max_size = sequences[0].size() # (TODO Hui Zhang): slice not supprot `end==start` # trailing_dims = max_size[1:] - trailing_dims = max_size[1:] if max_size.ndim >= 2 else () + trailing_dims = tuple(max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else () max_len = max([s.shape[0] for s in sequences]) if batch_first: out_dims = (len(sequences), max_len) + trailing_dims @@ -99,7 +99,7 @@ def pad_sequence(sequences: List[paddle.Tensor], if batch_first: # TODO (Hui Zhang): set_value op not supprot `end==start` # TODO (Hui Zhang): set_value op not support int16 - # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...] + # TODO (Hui Zhang): set_varbase 2 rank not support [0,0,...] # out_tensor[i, :length, ...] = tensor if length != 0: out_tensor[i, :length] = tensor @@ -145,7 +145,7 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int, [ 4, 5, 6, 11, -1, -1], [ 7, 8, 9, 11, -1, -1]]) """ - # TODO(Hui Zhang): using comment code, + # TODO(Hui Zhang): using comment code, #_sos = paddle.to_tensor( # [sos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place) #_eos = paddle.to_tensor( From e1888f9ae6d239b8c28f9739f7fd2a0120caac9e Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 24 May 2022 12:37:42 +0000 Subject: [PATCH 02/14] remove size,test=asr --- paddlespeech/s2t/__init__.py | 19 ------------- .../s2t/decoders/beam_search/beam_search.py | 10 +++---- paddlespeech/s2t/decoders/scorers/ctc.py | 4 +-- .../s2t/decoders/scorers/ctc_prefix_score.py | 27 +++++++++---------- paddlespeech/s2t/models/u2/u2.py | 2 +- paddlespeech/s2t/modules/decoder.py | 2 +- paddlespeech/s2t/modules/embedding.py | 4 +-- paddlespeech/s2t/utils/tensor_utils.py | 6 ++--- 8 files changed, 27 insertions(+), 47 deletions(-) diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py index 2365071f3..7ec9e1aba 100644 --- a/paddlespeech/s2t/__init__.py +++ b/paddlespeech/s2t/__init__.py @@ -189,25 +189,6 @@ if not hasattr(paddle.Tensor, 'contiguous'): paddle.static.Variable.contiguous = contiguous -def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor: - nargs = len(args) - assert (nargs <= 1) - s = paddle.shape(xs) - if nargs == 1: - return s[args[0]] - else: - return s - - -#`to_static` do not process `size` property, maybe some `paddle` api dependent on it. -logger.debug( - "override size of paddle.Tensor " - "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!" 
-) -paddle.Tensor.size = size -paddle.static.Variable.size = size - - def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor: return xs.reshape(args) diff --git a/paddlespeech/s2t/decoders/beam_search/beam_search.py b/paddlespeech/s2t/decoders/beam_search/beam_search.py index f331cb1c9..5029e1577 100644 --- a/paddlespeech/s2t/decoders/beam_search/beam_search.py +++ b/paddlespeech/s2t/decoders/beam_search/beam_search.py @@ -194,7 +194,7 @@ class BeamSearch(paddle.nn.Layer): Args: hyp (Hypothesis): Hypothesis with prefix tokens to score - ids (paddle.Tensor): 1D tensor of new partial tokens to score, + ids (paddle.Tensor): 1D tensor of new partial tokens to score, len(ids) < n_vocab x (paddle.Tensor): Corresponding input feature, (T, D) @@ -224,14 +224,14 @@ class BeamSearch(paddle.nn.Layer): ids (paddle.Tensor): The partial token ids(Global) to compute topk. Returns: - Tuple[paddle.Tensor, paddle.Tensor]: + Tuple[paddle.Tensor, paddle.Tensor]: The topk full token ids and partial token ids. Their shapes are `(self.beam_size,)`. i.e. (global ids, global relative local ids). """ # no pre beam performed, `ids` equal to `weighted_scores` - if weighted_scores.size(0) == ids.size(0): + if weighted_scores.shape[0] == ids.shape[0]: top_ids = weighted_scores.topk( self.beam_size)[1] # index in n_vocab return top_ids, top_ids @@ -374,8 +374,8 @@ class BeamSearch(paddle.nn.Layer): elif maxlenratio < 0: maxlen = -1 * int(maxlenratio) else: - maxlen = max(1, int(maxlenratio * x.size(0))) - minlen = int(minlenratio * x.size(0)) + maxlen = max(1, int(maxlenratio * x.shape[0])) + minlen = int(minlenratio * x.shape[0]) logger.info("decoder input length: " + str(x.shape[0])) logger.info("max output length: " + str(maxlen)) logger.info("min output length: " + str(minlen)) diff --git a/paddlespeech/s2t/decoders/scorers/ctc.py b/paddlespeech/s2t/decoders/scorers/ctc.py index 81d8b0783..6f1d8c007 100644 --- a/paddlespeech/s2t/decoders/scorers/ctc.py +++ b/paddlespeech/s2t/decoders/scorers/ctc.py @@ -69,7 +69,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface): return sc[i], st[i] else: # for CTCPrefixScorePD (need new_id > 0) r, log_psi, f_min, f_max, scoring_idmap = state - s = log_psi[i, new_id].expand(log_psi.size(1)) + s = log_psi[i, new_id].expand(log_psi.shape[1]) if scoring_idmap is not None: return r[:, :, i, scoring_idmap[i, new_id]], s, f_min, f_max else: @@ -107,7 +107,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface): """ logp = self.ctc.log_softmax(x.unsqueeze(0)) # assuming batch_size = 1 - xlen = paddle.to_tensor([logp.size(1)]) + xlen = paddle.to_tensor([logp.shape[1]]) self.impl = CTCPrefixScorePD(logp, xlen, 0, self.eos) return None diff --git a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py index 78b8fe36c..0e63a52a8 100644 --- a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py +++ b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py @@ -33,9 +33,9 @@ class CTCPrefixScorePD(): self.logzero = -10000000000.0 self.blank = blank self.eos = eos - self.batch = x.size(0) - self.input_length = x.size(1) - self.odim = x.size(2) + self.batch = x.shape[0] + self.input_length = x.shape[1] + self.odim = x.shape[2] self.dtype = x.dtype # Pad the rest of posteriors in the batch @@ -76,8 +76,7 @@ class CTCPrefixScorePD(): last_ids = [yi[-1] for yi in y] # last output label ids n_bh = len(last_ids) # batch * hyps n_hyps = n_bh // self.batch # assuming each utterance has the same # of hyps - self.scoring_num = scoring_ids.size( - 
-1) if scoring_ids is not None else 0 + self.scoring_num = scoring_ids.shape[-1] if scoring_ids is not None else 0 # prepare state info if state is None: r_prev = paddle.full( @@ -153,7 +152,7 @@ class CTCPrefixScorePD(): # compute forward probabilities log(r_t^n(h)) and log(r_t^b(h)) for t in range(start, end): - rp = r[t - 1] # (2 x BW x O') + rp = r[t - 1] # (2 x BW x O') rr = paddle.stack([rp[0], log_phi[t - 1], rp[0], rp[1]]).view( 2, 2, n_bh, snum) # (2,2,BW,O') r[t] = paddle.logsumexp(rr, 1) + x_[:, t] @@ -227,7 +226,7 @@ class CTCPrefixScorePD(): if self.x.shape[1] < x.shape[1]: # self.x (2,T,B,O); x (B,T,O) # Pad the rest of posteriors in the batch # TODO(takaaki-hori): need a better way without for-loops - xlens = [x.size(1)] + xlens = [x.shape[1]] for i, l in enumerate(xlens): if l < self.input_length: x[i, l:, :] = self.logzero @@ -237,7 +236,7 @@ class CTCPrefixScorePD(): xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim) self.x = paddle.stack([xn, xb]) # (2, T, B, O) self.x[:, :tmp_x.shape[1], :, :] = tmp_x - self.input_length = x.size(1) + self.input_length = x.shape[1] self.end_frames = paddle.to_tensor(xlens) - 1 def extend_state(self, state): @@ -318,16 +317,16 @@ class CTCPrefixScore(): r[0, 0] = xs[0] r[0, 1] = self.logzero else: - # Although the code does not exactly follow Algorithm 2, - # we don't have to change it because we can assume - # r_t(h)=0 for t < |h| in CTC forward computation + # Although the code does not exactly follow Algorithm 2, + # we don't have to change it because we can assume + # r_t(h)=0 for t < |h| in CTC forward computation # (Note: we assume here that index t starts with 0). # The purpose of this difference is to reduce the number of for-loops. # https://github.com/espnet/espnet/pull/3655 - # where we start to accumulate r_t(h) from t=|h| - # and iterate r_t(h) = (r_{t-1}(h) + ...) to T-1, + # where we start to accumulate r_t(h) from t=|h| + # and iterate r_t(h) = (r_{t-1}(h) + ...) to T-1, # avoiding accumulating zeros for t=1~|h|-1. - # Thus, we need to set r_{|h|-1}(h) = 0, + # Thus, we need to set r_{|h|-1}(h) = 0, # i.e., r[output_length-1] = logzero, for initialization. # This is just for reducing the computation. 
r[output_length - 1] = self.logzero diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index 530840d0f..e3f46b15a 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -775,7 +775,7 @@ class U2DecodeModel(U2BaseModel): """ self.eval() x = paddle.to_tensor(x).unsqueeze(0) - ilen = x.size(1) + ilen = x.shape[1] enc_output, _ = self._forward_encoder(x, ilen) return enc_output.squeeze(0) diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py index 42ac119b4..ce78059c0 100644 --- a/paddlespeech/s2t/modules/decoder.py +++ b/paddlespeech/s2t/modules/decoder.py @@ -242,7 +242,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer): ] # batch decoding - ys_mask = subsequent_mask(ys.size(-1)).unsqueeze(0) # (B,L,L) + ys_mask = subsequent_mask(ys.shape[-1]).unsqueeze(0) # (B,L,L) xs_mask = make_xs_mask(xs).unsqueeze(1) # (B,1,T) logp, states = self.forward_one_step( xs, xs_mask, ys, ys_mask, cache=batch_state) diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py index 596f61b78..cc1fdffe2 100644 --- a/paddlespeech/s2t/modules/embedding.py +++ b/paddlespeech/s2t/modules/embedding.py @@ -115,7 +115,7 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface): assert offset + x.shape[ 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format( offset, x.shape[1], self.max_len) - #TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor + #TODO(Hui Zhang): using T = x.shape[1], __getitem__ not support Tensor pos_emb = self.pe[:, offset:offset + T] x = x * self.xscale + pos_emb return self.dropout(x), self.dropout(pos_emb) @@ -165,6 +165,6 @@ class RelPositionalEncoding(PositionalEncoding): 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format( offset, x.shape[1], self.max_len) x = x * self.xscale - #TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor + #TODO(Hui Zhang): using x.shape[1], __getitem__ not support Tensor pos_emb = self.pe[:, offset:offset + x.shape[1]] return self.dropout(x), self.dropout(pos_emb) diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py index e105253c2..ca8689569 100644 --- a/paddlespeech/s2t/utils/tensor_utils.py +++ b/paddlespeech/s2t/utils/tensor_utils.py @@ -58,8 +58,8 @@ def pad_sequence(sequences: List[paddle.Tensor], >>> a = paddle.ones(25, 300) >>> b = paddle.ones(22, 300) >>> c = paddle.ones(15, 300) - >>> pad_sequence([a, b, c]).size() - paddle.Tensor([25, 3, 300]) + >>> pad_sequence([a, b, c]).shape + [25, 3, 300] Note: This function returns a Tensor of size ``T x B x *`` or ``B x T x *`` @@ -79,7 +79,7 @@ def pad_sequence(sequences: List[paddle.Tensor], # assuming trailing dimensions and type of all the Tensors # in sequences are same and fetching those from sequences[0] - max_size = sequences[0].size() + max_size = sequences[0].shape # (TODO Hui Zhang): slice not supprot `end==start` # trailing_dims = max_size[1:] trailing_dims = tuple(max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else () From 4c09927f61668952ee263cd178798b0ea5634760 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Tue, 24 May 2022 13:34:01 +0000 Subject: [PATCH 03/14] fix --- paddlespeech/s2t/__init__.py | 2 +- paddlespeech/s2t/models/lm/transformer.py | 4 ++-- paddlespeech/s2t/modules/encoder.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddlespeech/s2t/__init__.py 
b/paddlespeech/s2t/__init__.py
index 7ec9e1aba..a2fce3057 100644
--- a/paddlespeech/s2t/__init__.py
+++ b/paddlespeech/s2t/__init__.py
@@ -200,7 +200,7 @@ if not hasattr(paddle.Tensor, 'view'):
 
 
 def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
-    return xs.reshape(ys.size())
+    return xs.reshape(ys.shape)
 
 
 if not hasattr(paddle.Tensor, 'view_as'):
diff --git a/paddlespeech/s2t/models/lm/transformer.py b/paddlespeech/s2t/models/lm/transformer.py
index 85bd7c232..bb281168f 100644
--- a/paddlespeech/s2t/models/lm/transformer.py
+++ b/paddlespeech/s2t/models/lm/transformer.py
@@ -90,7 +90,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
 
     def _target_mask(self, ys_in_pad):
         ys_mask = ys_in_pad != 0
-        m = subsequent_mask(ys_mask.size(-1)).unsqueeze(0)
+        m = subsequent_mask(ys_mask.shape[-1]).unsqueeze(0)
         return ys_mask.unsqueeze(-2) & m
 
     def forward(self, x: paddle.Tensor, t: paddle.Tensor
@@ -112,7 +112,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
             in perplexity: p(t)^{-n} = exp(-log p(t) / n)
         """
-        batch_size = x.size(0)
+        batch_size = x.shape[0]
         xm = x != 0
         xlen = xm.sum(axis=1)
         if self.embed_drop is not None:
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index 669a12d65..7298c61f2 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -218,7 +218,7 @@ class BaseEncoder(nn.Layer):
         assert xs.shape[0] == 1  # batch size must be one
         # tmp_masks is just for interface compatibility
         # TODO(Hui Zhang): stride_slice not support bool tensor
-        # tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
+        # tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.bool)
         tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32)
         tmp_masks = tmp_masks.unsqueeze(1)  #[B=1, C=1, T]
 
From b23bde8ec5ff4ed3990f151246dfbb8c9dccf385 Mon Sep 17 00:00:00 2001
From: huangyuxin
Date: Wed, 25 May 2022 03:30:48 +0000
Subject: [PATCH 04/14] tensor.shape => paddle.shape(tensor)

---
 paddlespeech/s2t/__init__.py                         |  2 +-
 paddlespeech/s2t/decoders/beam_search/beam_search.py | 10 +++++-----
 paddlespeech/s2t/decoders/scorers/ctc.py             |  4 ++--
 .../s2t/decoders/scorers/ctc_prefix_score.py         | 12 ++++++------
 paddlespeech/s2t/models/lm/transformer.py            |  6 +++---
 paddlespeech/s2t/models/u2/u2.py                     |  2 +-
 paddlespeech/s2t/modules/decoder.py                  |  2 +-
 paddlespeech/s2t/modules/embedding.py                |  4 ++--
 paddlespeech/s2t/modules/encoder.py                  |  2 +-
 paddlespeech/s2t/utils/tensor_utils.py               |  4 ++--
 10 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/paddlespeech/s2t/__init__.py b/paddlespeech/s2t/__init__.py
index a2fce3057..2da68435c 100644
--- a/paddlespeech/s2t/__init__.py
+++ b/paddlespeech/s2t/__init__.py
@@ -200,7 +200,7 @@ if not hasattr(paddle.Tensor, 'view'):
 
 
 def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
-    return xs.reshape(ys.shape)
+    return xs.reshape(paddle.shape(ys))
 
 
 if not hasattr(paddle.Tensor, 'view_as'):
diff --git a/paddlespeech/s2t/decoders/beam_search/beam_search.py b/paddlespeech/s2t/decoders/beam_search/beam_search.py
index 5029e1577..f6a2b4b0a 100644
--- a/paddlespeech/s2t/decoders/beam_search/beam_search.py
+++ b/paddlespeech/s2t/decoders/beam_search/beam_search.py
@@ -231,7 +231,7 @@ class BeamSearch(paddle.nn.Layer):
         """
         # no pre beam performed, `ids` equal to `weighted_scores`
-        if weighted_scores.shape[0] == ids.shape[0]:
+        if paddle.shape(weighted_scores)[0] == paddle.shape(ids)[0]:
             top_ids = weighted_scores.topk(
                 self.beam_size)[1]  # index in n_vocab
return top_ids, top_ids @@ -370,13 +370,13 @@ class BeamSearch(paddle.nn.Layer): """ # set length bounds if maxlenratio == 0: - maxlen = x.shape[0] + maxlen = paddle.shape(x)[0] elif maxlenratio < 0: maxlen = -1 * int(maxlenratio) else: - maxlen = max(1, int(maxlenratio * x.shape[0])) - minlen = int(minlenratio * x.shape[0]) - logger.info("decoder input length: " + str(x.shape[0])) + maxlen = max(1, int(maxlenratio * paddle.shape(x)[0])) + minlen = int(minlenratio * paddle.shape(x)[0]) + logger.info("decoder input length: " + str(paddle.shape(x)[0])) logger.info("max output length: " + str(maxlen)) logger.info("min output length: " + str(minlen)) diff --git a/paddlespeech/s2t/decoders/scorers/ctc.py b/paddlespeech/s2t/decoders/scorers/ctc.py index 6f1d8c007..3c1d4cf80 100644 --- a/paddlespeech/s2t/decoders/scorers/ctc.py +++ b/paddlespeech/s2t/decoders/scorers/ctc.py @@ -69,7 +69,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface): return sc[i], st[i] else: # for CTCPrefixScorePD (need new_id > 0) r, log_psi, f_min, f_max, scoring_idmap = state - s = log_psi[i, new_id].expand(log_psi.shape[1]) + s = log_psi[i, new_id].expand(paddle.shape(log_psi)[1]) if scoring_idmap is not None: return r[:, :, i, scoring_idmap[i, new_id]], s, f_min, f_max else: @@ -107,7 +107,7 @@ class CTCPrefixScorer(BatchPartialScorerInterface): """ logp = self.ctc.log_softmax(x.unsqueeze(0)) # assuming batch_size = 1 - xlen = paddle.to_tensor([logp.shape[1]]) + xlen = paddle.to_tensor([paddle.shape(logp)[1]]) self.impl = CTCPrefixScorePD(logp, xlen, 0, self.eos) return None diff --git a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py index 0e63a52a8..d8ca5ccde 100644 --- a/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py +++ b/paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py @@ -33,9 +33,9 @@ class CTCPrefixScorePD(): self.logzero = -10000000000.0 self.blank = blank self.eos = eos - self.batch = x.shape[0] - self.input_length = x.shape[1] - self.odim = x.shape[2] + self.batch = paddle.shape(x)[0] + self.input_length = paddle.shape(x)[1] + self.odim = paddle.shape(x)[2] self.dtype = x.dtype # Pad the rest of posteriors in the batch @@ -76,7 +76,7 @@ class CTCPrefixScorePD(): last_ids = [yi[-1] for yi in y] # last output label ids n_bh = len(last_ids) # batch * hyps n_hyps = n_bh // self.batch # assuming each utterance has the same # of hyps - self.scoring_num = scoring_ids.shape[-1] if scoring_ids is not None else 0 + self.scoring_num = paddle.shape(scoring_ids)[-1] if scoring_ids is not None else 0 # prepare state info if state is None: r_prev = paddle.full( @@ -226,7 +226,7 @@ class CTCPrefixScorePD(): if self.x.shape[1] < x.shape[1]: # self.x (2,T,B,O); x (B,T,O) # Pad the rest of posteriors in the batch # TODO(takaaki-hori): need a better way without for-loops - xlens = [x.shape[1]] + xlens = [paddle.shape(x)[1]] for i, l in enumerate(xlens): if l < self.input_length: x[i, l:, :] = self.logzero @@ -236,7 +236,7 @@ class CTCPrefixScorePD(): xb = xn[:, :, self.blank].unsqueeze(2).expand(-1, -1, self.odim) self.x = paddle.stack([xn, xb]) # (2, T, B, O) self.x[:, :tmp_x.shape[1], :, :] = tmp_x - self.input_length = x.shape[1] + self.input_length = paddle.shape(x)[1] self.end_frames = paddle.to_tensor(xlens) - 1 def extend_state(self, state): diff --git a/paddlespeech/s2t/models/lm/transformer.py b/paddlespeech/s2t/models/lm/transformer.py index bb281168f..d14f99563 100644 --- a/paddlespeech/s2t/models/lm/transformer.py +++ 
b/paddlespeech/s2t/models/lm/transformer.py
@@ -90,7 +90,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
 
     def _target_mask(self, ys_in_pad):
         ys_mask = ys_in_pad != 0
-        m = subsequent_mask(ys_mask.shape[-1]).unsqueeze(0)
+        m = subsequent_mask(paddle.shape(ys_mask)[-1]).unsqueeze(0)
         return ys_mask.unsqueeze(-2) & m
 
     def forward(self, x: paddle.Tensor, t: paddle.Tensor
@@ -112,7 +112,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
             in perplexity: p(t)^{-n} = exp(-log p(t) / n)
         """
-        batch_size = x.shape[0]
+        batch_size = paddle.shape(x)[0]
         xm = x != 0
         xlen = xm.sum(axis=1)
         if self.embed_drop is not None:
@@ -122,7 +122,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         h, _ = self.encoder(emb, xlen)
         y = self.decoder(h)
         loss = F.cross_entropy(
-            y.view(-1, y.shape[-1]), t.view(-1), reduction="none")
+            y.view(-1, paddle.shape(y)[-1]), t.view(-1), reduction="none")
         mask = xm.to(loss.dtype)
         logp = loss * mask.view(-1)
         nll = logp.view(batch_size, -1).sum(-1)
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index e3f46b15a..d5471369f 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -775,7 +775,7 @@ class U2DecodeModel(U2BaseModel):
         """
         self.eval()
         x = paddle.to_tensor(x).unsqueeze(0)
-        ilen = x.shape[1]
+        ilen = paddle.shape(x)[1]
         enc_output, _ = self._forward_encoder(x, ilen)
         return enc_output.squeeze(0)
diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py
index ce78059c0..ccc8482d5 100644
--- a/paddlespeech/s2t/modules/decoder.py
+++ b/paddlespeech/s2t/modules/decoder.py
@@ -242,7 +242,7 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
         ]
 
         # batch decoding
-        ys_mask = subsequent_mask(ys.shape[-1]).unsqueeze(0)  # (B,L,L)
+        ys_mask = subsequent_mask(paddle.shape(ys)[-1]).unsqueeze(0)  # (B,L,L)
         xs_mask = make_xs_mask(xs).unsqueeze(1)  # (B,1,T)
         logp, states = self.forward_one_step(
             xs, xs_mask, ys, ys_mask, cache=batch_state)
diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py
index cc1fdffe2..51e558eb8 100644
--- a/paddlespeech/s2t/modules/embedding.py
+++ b/paddlespeech/s2t/modules/embedding.py
@@ -115,7 +115,7 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface):
         assert offset + x.shape[
             1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format(
                 offset, x.shape[1], self.max_len)
-        #TODO(Hui Zhang): using T = x.shape[1], __getitem__ not support Tensor
+        #TODO(Hui Zhang): using T = paddle.shape(x)[1], __getitem__ not support Tensor
         pos_emb = self.pe[:, offset:offset + T]
         x = x * self.xscale + pos_emb
         return self.dropout(x), self.dropout(pos_emb)
@@ -165,6 +165,6 @@ class RelPositionalEncoding(PositionalEncoding):
                 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format(
                     offset, x.shape[1], self.max_len)
         x = x * self.xscale
-        #TODO(Hui Zhang): using x.shape[1], __getitem__ not support Tensor
+        #TODO(Hui Zhang): using paddle.shape(x)[1], __getitem__ not support Tensor
         pos_emb = self.pe[:, offset:offset + x.shape[1]]
         return self.dropout(x), self.dropout(pos_emb)
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index 7298c61f2..4d31acf1a 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -218,7 +218,7 @@ class BaseEncoder(nn.Layer):
         assert xs.shape[0] == 1  # batch size must be one
         # tmp_masks is just for interface compatibility
         #
TODO(Hui Zhang): stride_slice not support bool tensor - # tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.bool) + # tmp_masks = paddle.ones([1, paddle.shape(xs)[1]], dtype=paddle.bool) tmp_masks = paddle.ones([1, xs.shape[1]], dtype=paddle.int32) tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T] diff --git a/paddlespeech/s2t/utils/tensor_utils.py b/paddlespeech/s2t/utils/tensor_utils.py index ca8689569..bc557b130 100644 --- a/paddlespeech/s2t/utils/tensor_utils.py +++ b/paddlespeech/s2t/utils/tensor_utils.py @@ -59,7 +59,7 @@ def pad_sequence(sequences: List[paddle.Tensor], >>> b = paddle.ones(22, 300) >>> c = paddle.ones(15, 300) >>> pad_sequence([a, b, c]).shape - [25, 3, 300] + paddle.Tensor([25, 3, 300]) Note: This function returns a Tensor of size ``T x B x *`` or ``B x T x *`` @@ -79,7 +79,7 @@ def pad_sequence(sequences: List[paddle.Tensor], # assuming trailing dimensions and type of all the Tensors # in sequences are same and fetching those from sequences[0] - max_size = sequences[0].shape + max_size = paddle.shape(sequences[0]) # (TODO Hui Zhang): slice not supprot `end==start` # trailing_dims = max_size[1:] trailing_dims = tuple(max_size[1:].numpy().tolist()) if sequences[0].ndim >= 2 else () From f9f014d159e28efa788f4d241794420716d369ad Mon Sep 17 00:00:00 2001 From: TianYuan Date: Wed, 25 May 2022 10:39:28 +0000 Subject: [PATCH 05/14] add VITS readme, test=tts --- examples/aishell3/tts3/README.md | 30 ++-- examples/aishell3/voc1/README.md | 2 +- examples/aishell3/voc5/README.md | 21 +-- examples/csmsc/tts0/README.md | 30 ++-- examples/csmsc/tts2/README.md | 30 ++-- examples/csmsc/tts3/README.md | 31 ++-- examples/csmsc/tts3/README_cn.md | 30 ++-- examples/csmsc/vits/README.md | 146 ++++++++++++++++++ examples/csmsc/voc1/README.md | 2 +- examples/csmsc/voc3/README.md | 2 +- examples/csmsc/voc4/README.md | 2 +- examples/csmsc/voc5/README.md | 2 +- examples/csmsc/voc6/README.md | 2 +- examples/ljspeech/tts0/README.md | 30 ++-- examples/ljspeech/tts1/README.md | 2 +- examples/ljspeech/tts3/README.md | 30 ++-- examples/ljspeech/voc1/README.md | 2 +- examples/ljspeech/voc5/README.md | 21 +-- examples/vctk/tts3/README.md | 30 ++-- examples/vctk/voc1/README.md | 2 +- examples/vctk/voc5/README.md | 21 +-- .../t2s/exps/gan_vocoder/hifigan/train.py | 3 +- .../gan_vocoder/multi_band_melgan/train.py | 2 +- .../gan_vocoder/parallelwave_gan/train.py | 2 +- .../exps/gan_vocoder/style_melgan/train.py | 3 +- .../t2s/exps/transformer_tts/train.py | 2 +- paddlespeech/t2s/exps/vits/train.py | 5 +- paddlespeech/t2s/exps/wavernn/train.py | 3 +- 28 files changed, 285 insertions(+), 203 deletions(-) diff --git a/examples/aishell3/tts3/README.md b/examples/aishell3/tts3/README.md index d02ad1b63..93ce62c96 100644 --- a/examples/aishell3/tts3/README.md +++ b/examples/aishell3/tts3/README.md @@ -120,12 +120,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc 
{pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -134,11 +134,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -150,10 +149,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -169,12 +168,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -184,11 +183,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -199,10 +197,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. 
--spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -215,9 +213,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. diff --git a/examples/aishell3/voc1/README.md b/examples/aishell3/voc1/README.md index eb30e7c40..503f8a19d 100644 --- a/examples/aishell3/voc1/README.md +++ b/examples/aishell3/voc1/README.md @@ -75,7 +75,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/aishell3/voc5/README.md b/examples/aishell3/voc5/README.md index c957c4a3a..f8f28f409 100644 --- a/examples/aishell3/voc5/README.md +++ b/examples/aishell3/voc5/README.md @@ -67,15 +67,13 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] -Train a ParallelWaveGAN model. +Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA @@ -83,19 +81,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. - --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. 
`--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. diff --git a/examples/csmsc/tts0/README.md b/examples/csmsc/tts0/README.md index 01376bd61..a337c7d45 100644 --- a/examples/csmsc/tts0/README.md +++ b/examples/csmsc/tts0/README.md @@ -103,12 +103,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -117,11 +117,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -133,10 +132,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -152,12 +151,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -167,11 +166,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -182,10 +180,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -198,9 +196,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. 
`--text` is the text file, which contains sentences to synthesize. diff --git a/examples/csmsc/tts2/README.md b/examples/csmsc/tts2/README.md index 081d85848..553a370c9 100644 --- a/examples/csmsc/tts2/README.md +++ b/examples/csmsc/tts2/README.md @@ -109,12 +109,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -123,11 +123,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -139,10 +138,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -158,12 +157,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -173,11 +172,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -188,10 +186,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -204,9 +202,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` and `--tones_dict` are arguments for acoustic model, which correspond to the 5 files in the speedyspeech pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` and `--tones_dict` are arguments for acoustic model, which correspond to the 5 files in the speedyspeech pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. 
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md index c734199b4..be18de7d6 100644 --- a/examples/csmsc/tts3/README.md +++ b/examples/csmsc/tts3/README.md @@ -111,12 +111,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -125,11 +125,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -141,10 +140,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -160,12 +159,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -175,11 +174,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -190,10 +188,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -204,11 +202,12 @@ optional arguments: --text TEXT text to synthesize, a 'utt_id sentence' pair per line. --output_dir OUTPUT_DIR output dir. + ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. 
diff --git a/examples/csmsc/tts3/README_cn.md b/examples/csmsc/tts3/README_cn.md index 25931ecb1..a88615134 100644 --- a/examples/csmsc/tts3/README_cn.md +++ b/examples/csmsc/tts3/README_cn.md @@ -117,12 +117,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -131,11 +131,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -147,10 +146,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -167,12 +166,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -182,11 +181,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -197,10 +195,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -213,9 +211,9 @@ optional arguments: output dir. ``` 1. `--am` 声学模型格式是否符合 {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 fastspeech2 预训练模型中的 4 个文件。 +2. `--am_config`, `--am_ckpt`, `--am_stat` 和 `--phones_dict` 是声学模型的参数,对应于 fastspeech2 预训练模型中的 4 个文件。 3. `--voc` 声码器(vocoder)格式是否符合 {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。 +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` 是声码器的参数,对应于 parallel wavegan 预训练模型中的 3 个文件。 5. `--lang` 对应模型的语言可以是 `zh` 或 `en` 。 6. `--test_metadata` 应为 `dump` 文件夹中 `test` 下的规范化元数据文件、 7. `--text` 是文本文件,其中包含要合成的句子。 diff --git a/examples/csmsc/vits/README.md b/examples/csmsc/vits/README.md index e69de29bb..0c16840a0 100644 --- a/examples/csmsc/vits/README.md +++ b/examples/csmsc/vits/README.md @@ -0,0 +1,146 @@ +# VITS with CSMSC +This example contains code used to train a [VITS](https://arxiv.org/abs/2106.06103) model with [Chinese Standard Mandarin Speech Copus](https://www.data-baker.com/open_source.html). + +## Dataset +### Download and Extract +Download CSMSC from it's [Official Website](https://test.data-baker.com/data/index/source). 
+ +### Get MFA Result and Extract +We use [MFA](https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner) to get phonemes for VITS; the durations from MFA are not needed here. +You can download it from here: [baker_alignment_tone.tar.gz](https://paddlespeech.bj.bcebos.com/MFA/BZNSYP/with_tone/baker_alignment_tone.tar.gz), or train your own MFA model by referring to the [mfa example](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/other/mfa) in our repo. + +## Get Started +Assume the path to the dataset is `~/datasets/BZNSYP`. +Assume the path to the MFA result of CSMSC is `./baker_alignment_tone`. +Run the command below to +1. **source path**. +2. preprocess the dataset. +3. train the model. +4. synthesize wavs. + - synthesize waveform from `metadata.jsonl`. + - synthesize waveform from a text file. + +```bash +./run.sh +``` +You can choose a range of stages you want to run, or set `stage` equal to `stop-stage` to use only one stage, for example, running the following command will only preprocess the dataset. +```bash +./run.sh --stage 0 --stop-stage 0 +``` +### Data Preprocessing +```bash +./local/preprocess.sh ${conf_path} +``` +When it is done, a `dump` folder is created in the current directory. The structure of the dump folder is listed below. + +```text +dump +├── dev +│   ├── norm +│   └── raw +├── phone_id_map.txt +├── speaker_id_map.txt +├── test +│   ├── norm +│   └── raw +└── train + ├── feats_stats.npy + ├── norm + └── raw +``` +The dataset is split into 3 parts, namely `train`, `dev`, and `test`, each of which contains a `norm` and `raw` subfolder. The raw folder contains the wave and linear spectrogram of each utterance, while the norm folder contains normalized ones. The statistics used to normalize features are computed from the training set, which is located in `dump/train/feats_stats.npy`. + +Also, there is a `metadata.jsonl` in each subfolder. It is a table-like file that contains phones, text_lengths, feats, feats_lengths, the path of linear spectrogram features, the path of raw waves, speaker, and the id of each utterance. + +### Model Training +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} +``` +`./local/train.sh` calls `${BIN_DIR}/train.py`. +Here's the complete help message. +```text +usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] + [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] + [--ngpu NGPU] [--phones-dict PHONES_DICT] + +Train a VITS model. + +optional arguments: + -h, --help show this help message and exit + --config CONFIG config file to overwrite default config. + --train-metadata TRAIN_METADATA + training data. + --dev-metadata DEV_METADATA + dev data. + --output-dir OUTPUT_DIR + output dir. + --ngpu NGPU if ngpu == 0, use cpu. + --phones-dict PHONES_DICT + phone vocabulary file. +``` +1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. +2. `--train-metadata` and `--dev-metadata` should be the metadata file in the normalized subfolder of `train` and `dev` in the `dump` folder. +3. `--output-dir` is the directory to save the results of the experiment. Checkpoints are saved in `checkpoints/` inside this directory. +4. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. +5. `--phones-dict` is the path of the phone vocabulary file. + +### Synthesizing + +`./local/synthesize.sh` calls `${BIN_DIR}/synthesize.py`, which can synthesize waveforms from `metadata.jsonl`.
+ +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize.py [-h] [--config CONFIG] [--ckpt CKPT] + [--phones_dict PHONES_DICT] [--ngpu NGPU] + [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] + +Synthesize with VITS + +optional arguments: + -h, --help show this help message and exit + --config CONFIG Config of VITS. + --ckpt CKPT Checkpoint file of VITS. + --phones_dict PHONES_DICT + phone vocabulary file. + --ngpu NGPU if ngpu == 0, use cpu. + --test_metadata TEST_METADATA + test metadata. + --output_dir OUTPUT_DIR + output dir. +``` +`./local/synthesize_e2e.sh` calls `${BIN_DIR}/synthesize_e2e.py`, which can synthesize waveforms from a text file. +```bash +CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} +``` +```text +usage: synthesize_e2e.py [-h] [--config CONFIG] [--ckpt CKPT] + [--phones_dict PHONES_DICT] [--lang LANG] + [--inference_dir INFERENCE_DIR] [--ngpu NGPU] + [--text TEXT] [--output_dir OUTPUT_DIR] + +Synthesize with VITS + +optional arguments: + -h, --help show this help message and exit + --config CONFIG Config of VITS. + --ckpt CKPT Checkpoint file of VITS. + --phones_dict PHONES_DICT + phone vocabulary file. + --lang LANG Choose model language. zh or en + --inference_dir INFERENCE_DIR + dir to save inference models + --ngpu NGPU if ngpu == 0, use cpu. + --text TEXT text to synthesize, a 'utt_id sentence' pair per line. + --output_dir OUTPUT_DIR + output dir. +``` +1. `--config`, `--ckpt`, and `--phones_dict` are arguments for acoustic model, which correspond to the 3 files in the VITS pretrained model. +2. `--lang` is the model language, which can be `zh` or `en`. +3. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. +4. `--text` is the text file, which contains sentences to synthesize. +5. `--output_dir` is the directory to save synthesized audio files. +6. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu. + +## Pretrained Model diff --git a/examples/csmsc/voc1/README.md b/examples/csmsc/voc1/README.md index 77da5b185..d19fe8497 100644 --- a/examples/csmsc/voc1/README.md +++ b/examples/csmsc/voc1/README.md @@ -65,7 +65,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc3/README.md b/examples/csmsc/voc3/README.md index 12adaf7f4..eb7710362 100644 --- a/examples/csmsc/voc3/README.md +++ b/examples/csmsc/voc3/README.md @@ -63,7 +63,7 @@ Train a Multi-Band MelGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG Multi-Band MelGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc4/README.md b/examples/csmsc/voc4/README.md index b7add3e57..d9e86a88d 100644 --- a/examples/csmsc/voc4/README.md +++ b/examples/csmsc/voc4/README.md @@ -63,7 +63,7 @@ Train a Style MelGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG Style MelGAN config file. --train-metadata TRAIN_METADATA training data.
--dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc5/README.md b/examples/csmsc/voc5/README.md index 94f93b48b..e044a0c74 100644 --- a/examples/csmsc/voc5/README.md +++ b/examples/csmsc/voc5/README.md @@ -63,7 +63,7 @@ Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/csmsc/voc6/README.md b/examples/csmsc/voc6/README.md index 7dcf133bd..f1a5ec3bb 100644 --- a/examples/csmsc/voc6/README.md +++ b/examples/csmsc/voc6/README.md @@ -63,7 +63,7 @@ Train a WaveRNN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG WaveRNN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/ljspeech/tts0/README.md b/examples/ljspeech/tts0/README.md index ba7ad6193..581f7930f 100644 --- a/examples/ljspeech/tts0/README.md +++ b/examples/ljspeech/tts0/README.md @@ -103,12 +103,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -117,11 +117,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -133,10 +132,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. 
@@ -152,12 +151,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -167,11 +166,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -182,10 +180,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -198,9 +196,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the Tacotron2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. 
`--text` is the text file, which contains sentences to synthesize. diff --git a/examples/ljspeech/tts1/README.md b/examples/ljspeech/tts1/README.md index 7f32522ac..f85991cba 100644 --- a/examples/ljspeech/tts1/README.md +++ b/examples/ljspeech/tts1/README.md @@ -61,7 +61,7 @@ Train a TransformerTTS model with LJSpeech TTS dataset. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG TransformerTTS config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/ljspeech/tts3/README.md b/examples/ljspeech/tts3/README.md index e028fa05d..a6724083d 100644 --- a/examples/ljspeech/tts3/README.md +++ b/examples/ljspeech/tts3/README.md @@ -109,12 +109,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -123,11 +123,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -139,10 +138,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc.
@@ -158,12 +157,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -173,11 +172,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -188,10 +186,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -204,9 +202,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. `--am_config`, `--am_checkpoint`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat` and `--phones_dict` are arguments for acoustic model, which correspond to the 4 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. 
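The regenerated usage blocks above are argparse output: the `--am`/`--voc` lists in every README come straight from the `choices` of the corresponding `add_argument` calls, which is why adding a vocoder such as `wavernn_csmsc` in code forces all of these help texts to be refreshed. A minimal sketch of how such a parser is plausibly declared (the exact argument layout and defaults are assumptions, not copied from the repo; the choices mirror the updated lists in the hunks above):

```python
import argparse


# Sketch only: declares --am/--voc the way synthesize_e2e.py presumably does.
def build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description="Synthesize with acoustic model & vocoder")
    parser.add_argument(
        '--am',
        type=str,
        default='fastspeech2_csmsc',  # assumed default
        choices=[
            'speedyspeech_csmsc', 'speedyspeech_aishell3', 'fastspeech2_csmsc',
            'fastspeech2_ljspeech', 'fastspeech2_aishell3', 'fastspeech2_vctk',
            'tacotron2_csmsc', 'tacotron2_ljspeech'
        ],
        help='Choose acoustic model type of tts task.')
    parser.add_argument(
        '--voc',
        type=str,
        default='pwgan_csmsc',  # assumed default
        choices=[
            'pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3', 'pwgan_vctk',
            'mb_melgan_csmsc', 'style_melgan_csmsc', 'hifigan_csmsc',
            'hifigan_ljspeech', 'hifigan_aishell3', 'hifigan_vctk',
            'wavernn_csmsc'
        ],
        help='Choose vocoder type of tts task.')
    return parser
```

argparse renders these `choices` verbatim into the usage and help text, so the README blocks above can be regenerated with `build_parser().print_help()`.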
diff --git a/examples/ljspeech/voc1/README.md b/examples/ljspeech/voc1/README.md index 4513b2a05..6fd6cbe24 100644 --- a/examples/ljspeech/voc1/README.md +++ b/examples/ljspeech/voc1/README.md @@ -65,7 +65,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/ljspeech/voc5/README.md b/examples/ljspeech/voc5/README.md index 9b31e2650..afc1bb8be 100644 --- a/examples/ljspeech/voc5/README.md +++ b/examples/ljspeech/voc5/README.md @@ -57,15 +57,13 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] -Train a ParallelWaveGAN model. +Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA @@ -73,19 +71,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. - --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. diff --git a/examples/vctk/tts3/README.md b/examples/vctk/tts3/README.md index f373ca6a3..379f5c0fd 100644 --- a/examples/vctk/tts3/README.md +++ b/examples/vctk/tts3/README.md @@ -112,12 +112,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p ``` ```text usage: synthesize.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--voice-cloning VOICE_CLONING] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--ngpu NGPU] [--test_metadata TEST_METADATA] [--output_dir OUTPUT_DIR] @@ -126,11 +126,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech,tacotron2_aishell3} Choose acoustic model type of tts task. 
--am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -142,10 +141,10 @@ optional arguments: speaker id map file. --voice-cloning VOICE_CLONING whether training voice cloning model. - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,wavernn_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,style_melgan_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -161,12 +160,12 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_outp ``` ```text usage: synthesize_e2e.py [-h] - [--am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk}] + [--am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech}] [--am_config AM_CONFIG] [--am_ckpt AM_CKPT] [--am_stat AM_STAT] [--phones_dict PHONES_DICT] [--tones_dict TONES_DICT] [--speaker_dict SPEAKER_DICT] [--spk_id SPK_ID] - [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc}] + [--voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc}] [--voc_config VOC_CONFIG] [--voc_ckpt VOC_CKPT] [--voc_stat VOC_STAT] [--lang LANG] [--inference_dir INFERENCE_DIR] [--ngpu NGPU] @@ -176,11 +175,10 @@ Synthesize with acoustic model & vocoder optional arguments: -h, --help show this help message and exit - --am {speedyspeech_csmsc,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk} + --am {speedyspeech_csmsc,speedyspeech_aishell3,fastspeech2_csmsc,fastspeech2_ljspeech,fastspeech2_aishell3,fastspeech2_vctk,tacotron2_csmsc,tacotron2_ljspeech} Choose acoustic model type of tts task. --am_config AM_CONFIG - Config of acoustic model. Use deault config when it is - None. + Config of acoustic model. --am_ckpt AM_CKPT Checkpoint file of acoustic model. --am_stat AM_STAT mean and standard deviation used to normalize spectrogram when training acoustic model. @@ -191,10 +189,10 @@ optional arguments: --speaker_dict SPEAKER_DICT speaker id map file. --spk_id SPK_ID spk id for multi speaker acoustic model - --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc} + --voc {pwgan_csmsc,pwgan_ljspeech,pwgan_aishell3,pwgan_vctk,mb_melgan_csmsc,style_melgan_csmsc,hifigan_csmsc,hifigan_ljspeech,hifigan_aishell3,hifigan_vctk,wavernn_csmsc} Choose vocoder type of tts task. --voc_config VOC_CONFIG - Config of voc. Use deault config when it is None. + Config of voc. --voc_ckpt VOC_CKPT Checkpoint file of voc. --voc_stat VOC_STAT mean and standard deviation used to normalize spectrogram when training voc. @@ -207,9 +205,9 @@ optional arguments: output dir. ``` 1. `--am` is acoustic model type with the format {model_name}_{dataset} -2. 
`--am_config`, `--am_checkpoint`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. +2. `--am_config`, `--am_ckpt`, `--am_stat`, `--phones_dict` `--speaker_dict` are arguments for acoustic model, which correspond to the 5 files in the fastspeech2 pretrained model. 3. `--voc` is vocoder type with the format {model_name}_{dataset} -4. `--voc_config`, `--voc_checkpoint`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. +4. `--voc_config`, `--voc_ckpt`, `--voc_stat` are arguments for vocoder, which correspond to the 3 files in the parallel wavegan pretrained model. 5. `--lang` is the model language, which can be `zh` or `en`. 6. `--test_metadata` should be the metadata file in the normalized subfolder of `test` in the `dump` folder. 7. `--text` is the text file, which contains sentences to synthesize. diff --git a/examples/vctk/voc1/README.md b/examples/vctk/voc1/README.md index 1c3016f88..c4c40d1d0 100644 --- a/examples/vctk/voc1/README.md +++ b/examples/vctk/voc1/README.md @@ -70,7 +70,7 @@ Train a ParallelWaveGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG ParallelWaveGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA diff --git a/examples/vctk/voc5/README.md b/examples/vctk/voc5/README.md index 4eb25c02d..c53d46325 100644 --- a/examples/vctk/voc5/README.md +++ b/examples/vctk/voc5/README.md @@ -62,15 +62,13 @@ Here's the complete help message. ```text usage: train.py [-h] [--config CONFIG] [--train-metadata TRAIN_METADATA] [--dev-metadata DEV_METADATA] [--output-dir OUTPUT_DIR] - [--ngpu NGPU] [--batch-size BATCH_SIZE] [--max-iter MAX_ITER] - [--run-benchmark RUN_BENCHMARK] - [--profiler_options PROFILER_OPTIONS] + [--ngpu NGPU] -Train a ParallelWaveGAN model. +Train a HiFiGAN model. optional arguments: -h, --help show this help message and exit - --config CONFIG config file to overwrite default config. + --config CONFIG HiFiGAN config file. --train-metadata TRAIN_METADATA training data. --dev-metadata DEV_METADATA @@ -78,19 +76,6 @@ optional arguments: --output-dir OUTPUT_DIR output dir. --ngpu NGPU if ngpu == 0, use cpu. - -benchmark: - arguments related to benchmark. - - --batch-size BATCH_SIZE - batch size. - --max-iter MAX_ITER train max steps. - --run-benchmark RUN_BENCHMARK - runing benchmark or not, if True, use the --batch-size - and --max-iter. - --profiler_options PROFILER_OPTIONS - The option of profiler, which should be in format - "key1=value1;key2=value2;key3=value3". ``` 1. `--config` is a config file in yaml format to overwrite the default config, which can be found at `conf/default.yaml`. 
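Each README repeats that `--config` is "a config file in yaml format to overwrite the default config"; the underlying pattern is to load the example's `conf/default.yaml` first and then merge the user-supplied file over it. A minimal sketch of that load-then-merge step using `yacs` (which the codebase already imports elsewhere); the helper name and call sites are assumptions:

```python
import yaml
from yacs.config import CfgNode


def load_config(default_path: str, override_path: str=None) -> CfgNode:
    """Load conf/default.yaml, then overlay a user --config file on top."""
    with open(default_path, 'rt') as f:
        config = CfgNode(yaml.safe_load(f))
    if override_path is not None:
        # keys present in the user file replace the corresponding defaults
        config.merge_from_file(override_path)
    return config
```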
diff --git a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py index c70821e78..4c733dc9b 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/hifigan/train.py @@ -243,8 +243,7 @@ def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="Train a HiFiGAN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--config", type=str, help="HiFiGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py index 27ffded63..3b3ebb478 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py @@ -233,7 +233,7 @@ def main(): parser = argparse.ArgumentParser( description="Train a Multi-Band MelGAN model.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="Multi-Band MelGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py index 92de7a2c4..b26407028 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py @@ -208,7 +208,7 @@ def main(): parser = argparse.ArgumentParser( description="Train a ParallelWaveGAN model.") parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="ParallelWaveGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py index be3ba7425..a87cc7a18 100644 --- a/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py +++ b/paddlespeech/t2s/exps/gan_vocoder/style_melgan/train.py @@ -224,8 +224,7 @@ def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="Train a Style MelGAN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--config", type=str, help="Style MelGAN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py index 45ecb269b..da48b6b99 100644 --- a/paddlespeech/t2s/exps/transformer_tts/train.py +++ b/paddlespeech/t2s/exps/transformer_tts/train.py @@ -160,7 +160,7 @@ def main(): parser = argparse.ArgumentParser(description="Train a TransformerTTS " "model with LJSpeech TTS dataset.") 
parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + "--config", type=str, help="TransformerTTS config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/vits/train.py b/paddlespeech/t2s/exps/vits/train.py index b921f92af..dbda8b717 100644 --- a/paddlespeech/t2s/exps/vits/train.py +++ b/paddlespeech/t2s/exps/vits/train.py @@ -226,9 +226,8 @@ def train_sp(args, config): def main(): # parse args and config and redirect to train_sp - parser = argparse.ArgumentParser(description="Train a HiFiGAN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser = argparse.ArgumentParser(description="Train a VITS model.") + parser.add_argument("--config", type=str, help="VITS config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") diff --git a/paddlespeech/t2s/exps/wavernn/train.py b/paddlespeech/t2s/exps/wavernn/train.py index 8661d311d..cf24ea268 100644 --- a/paddlespeech/t2s/exps/wavernn/train.py +++ b/paddlespeech/t2s/exps/wavernn/train.py @@ -180,8 +180,7 @@ def main(): # parse args and config and redirect to train_sp parser = argparse.ArgumentParser(description="Train a WaveRNN model.") - parser.add_argument( - "--config", type=str, help="config file to overwrite default config.") + parser.add_argument("--config", type=str, help="WaveRNN config file.") parser.add_argument("--train-metadata", type=str, help="training data.") parser.add_argument("--dev-metadata", type=str, help="dev data.") parser.add_argument("--output-dir", type=str, help="output dir.") From 6c57c2bf8e3568ab5518731de113d075467aeb9a Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Wed, 25 May 2022 21:32:14 +0800 Subject: [PATCH 06/14] Dynamic cli commands registration. --- paddlespeech/cli/__init__.py | 7 ------- paddlespeech/cli/asr/infer.py | 3 --- paddlespeech/cli/base_commands.py | 18 ++++++++++++++++++ paddlespeech/cli/cls/infer.py | 5 +---- paddlespeech/cli/entry.py | 5 +++++ paddlespeech/cli/st/infer.py | 3 --- paddlespeech/cli/text/infer.py | 2 -- paddlespeech/cli/tts/infer.py | 3 --- paddlespeech/cli/utils.py | 11 +++++++++++ paddlespeech/cli/vector/infer.py | 6 +----- 10 files changed, 36 insertions(+), 27 deletions(-) diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py index ddf0359bc..ca6993f2b 100644 --- a/paddlespeech/cli/__init__.py +++ b/paddlespeech/cli/__init__.py @@ -13,14 +13,7 @@ # limitations under the License.
import _locale -from .asr import ASRExecutor from .base_commands import BaseCommand from .base_commands import HelpCommand -from .cls import CLSExecutor -from .st import STExecutor -from .stats import StatsExecutor -from .text import TextExecutor -from .tts import TTSExecutor -from .vector import VectorExecutor _locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8']) diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py index 2d74afa6d..09e8202fd 100644 --- a/paddlespeech/cli/asr/infer.py +++ b/paddlespeech/cli/asr/infer.py @@ -29,7 +29,6 @@ from yacs.config import CfgNode from ..download import get_path_from_url from ..executor import BaseExecutor from ..log import logger -from ..utils import cli_register from ..utils import CLI_TIMER from ..utils import MODEL_HOME from ..utils import stats_wrapper @@ -45,8 +44,6 @@ __all__ = ['ASRExecutor'] @timer_register -@cli_register( - name='paddlespeech.asr', description='Speech to text infer command.') class ASRExecutor(BaseExecutor): def __init__(self): super().__init__() diff --git a/paddlespeech/cli/base_commands.py b/paddlespeech/cli/base_commands.py index 0a26b1203..4d4d2cc69 100644 --- a/paddlespeech/cli/base_commands.py +++ b/paddlespeech/cli/base_commands.py @@ -15,6 +15,7 @@ from typing import List from .entry import commands from .utils import cli_register +from .utils import explicit_command_register from .utils import get_command __all__ = [ @@ -73,3 +74,20 @@ class VersionCommand: print(msg) return True + + +# Dynamic import when running specific command +_commands = { + 'asr': ['Speech to text infer command.', 'ASRExecutor'], + 'cls': ['Audio classification infer command.', 'CLSExecutor'], + 'st': ['Speech translation infer command.', 'STExecutor'], + 'text': ['Text command.', 'TextExecutor'], + 'tts': ['Text to Speech infer command.', 'TTSExecutor'], + 'vector': ['Speech to vector embedding infer command.', 'VectorExecutor'], +} + +for com, info in _commands.items(): + explicit_command_register( + name='paddlespeech.{}'.format(com), + description=info[0], + cls='paddlespeech.cli.{}.{}'.format(com, info[1])) diff --git a/paddlespeech/cli/cls/infer.py b/paddlespeech/cli/cls/infer.py index 40072d997..3d807b60b 100644 --- a/paddlespeech/cli/cls/infer.py +++ b/paddlespeech/cli/cls/infer.py @@ -27,7 +27,6 @@ from paddlespeech.utils.dynamic_import import dynamic_import from ..executor import BaseExecutor from ..log import logger -from ..utils import cli_register from ..utils import stats_wrapper from .pretrained_models import model_alias from .pretrained_models import pretrained_models @@ -36,8 +35,6 @@ from .pretrained_models import pretrained_models __all__ = ['CLSExecutor'] -@cli_register( - name='paddlespeech.cls', description='Audio classification infer command.') class CLSExecutor(BaseExecutor): def __init__(self): super().__init__() @@ -246,4 +243,4 @@ class CLSExecutor(BaseExecutor): self.infer() res = self.postprocess(topk) # Retrieve result of cls. - return res \ No newline at end of file + return res diff --git a/paddlespeech/cli/entry.py b/paddlespeech/cli/entry.py index 32123ece7..e0c306d62 100644 --- a/paddlespeech/cli/entry.py +++ b/paddlespeech/cli/entry.py @@ -34,6 +34,11 @@ def _execute(): # The method 'execute' of a command instance returns 'True' for a success # while 'False' for a failure. Here converts this result into a exit status # in bash: 0 for a success and 1 for a failure. 
+ if not callable(com['_entry']): + i = com['_entry'].rindex('.') + module, cls = com['_entry'][:i], com['_entry'][i + 1:] + exec("from {} import {}".format(module, cls)) + com['_entry'] = locals()[cls] status = 0 if com['_entry']().execute(sys.argv[idx:]) else 1 return status diff --git a/paddlespeech/cli/st/infer.py b/paddlespeech/cli/st/infer.py index 4f210fbe6..ae188b349 100644 --- a/paddlespeech/cli/st/infer.py +++ b/paddlespeech/cli/st/infer.py @@ -28,7 +28,6 @@ from yacs.config import CfgNode from ..executor import BaseExecutor from ..log import logger -from ..utils import cli_register from ..utils import download_and_decompress from ..utils import MODEL_HOME from ..utils import stats_wrapper @@ -42,8 +41,6 @@ from paddlespeech.utils.dynamic_import import dynamic_import __all__ = ["STExecutor"] -@cli_register( - name="paddlespeech.st", description="Speech translation infer command.") class STExecutor(BaseExecutor): def __init__(self): super().__init__() diff --git a/paddlespeech/cli/text/infer.py b/paddlespeech/cli/text/infer.py index 97f3bbe21..be5b5a10d 100644 --- a/paddlespeech/cli/text/infer.py +++ b/paddlespeech/cli/text/infer.py @@ -23,7 +23,6 @@ import paddle from ..executor import BaseExecutor from ..log import logger -from ..utils import cli_register from ..utils import stats_wrapper from .pretrained_models import model_alias from .pretrained_models import pretrained_models @@ -33,7 +32,6 @@ from paddlespeech.utils.dynamic_import import dynamic_import __all__ = ['TextExecutor'] -@cli_register(name='paddlespeech.text', description='Text infer command.') class TextExecutor(BaseExecutor): def __init__(self): super().__init__() diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py index efab9cb25..5fa9b3ed0 100644 --- a/paddlespeech/cli/tts/infer.py +++ b/paddlespeech/cli/tts/infer.py @@ -28,7 +28,6 @@ from yacs.config import CfgNode from ..executor import BaseExecutor from ..log import logger -from ..utils import cli_register from ..utils import stats_wrapper from .pretrained_models import model_alias from .pretrained_models import pretrained_models @@ -40,8 +39,6 @@ from paddlespeech.utils.dynamic_import import dynamic_import __all__ = ['TTSExecutor'] -@cli_register( - name='paddlespeech.tts', description='Text to Speech infer command.') class TTSExecutor(BaseExecutor): def __init__(self): super().__init__() diff --git a/paddlespeech/cli/utils.py b/paddlespeech/cli/utils.py index e7b499f72..128767e62 100644 --- a/paddlespeech/cli/utils.py +++ b/paddlespeech/cli/utils.py @@ -41,6 +41,7 @@ requests.adapters.DEFAULT_RETRIES = 3 __all__ = [ 'timer_register', 'cli_register', + 'explicit_command_register', 'get_command', 'download_and_decompress', 'load_state_dict_from_url', @@ -70,6 +71,16 @@ def cli_register(name: str, description: str='') -> Any: return _warpper +def explicit_command_register(name: str, description: str='', cls: str=''): + items = name.split('.') + com = commands + for item in items: + com = com[item] + com['_entry'] = cls + if description: + com['_description'] = description + + def get_command(name: str) -> Any: items = name.split('.') com = commands diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py index cc664369f..07fb73a4c 100644 --- a/paddlespeech/cli/vector/infer.py +++ b/paddlespeech/cli/vector/infer.py @@ -28,7 +28,6 @@ from yacs.config import CfgNode from ..executor import BaseExecutor from ..log import logger -from ..utils import cli_register from ..utils import stats_wrapper from .pretrained_models 
import model_alias from .pretrained_models import pretrained_models @@ -37,9 +36,6 @@ from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.modules.sid_model import SpeakerIdetification -@cli_register( - name="paddlespeech.vector", - description="Speech to vector embedding infer command.") class VectorExecutor(BaseExecutor): def __init__(self): super().__init__() @@ -476,4 +472,4 @@ class VectorExecutor(BaseExecutor): else: logger.info("The audio file format is right") - return True \ No newline at end of file + return True From 27a5de1af7852a70526673495250cf3ae0bc6b86 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 26 May 2022 10:35:08 +0800 Subject: [PATCH 07/14] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index a43e21bd2..c9d4796c8 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,8 @@ | Documents | Models List | AIStudio Courses + | Paper + | Gitee From fe3474729de6dd0720dd1f848eb92a480f485843 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Thu, 26 May 2022 10:36:05 +0800 Subject: [PATCH 08/14] Update README_cn.md --- README_cn.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README_cn.md b/README_cn.md index ed5c6a90d..c751b061d 100644 --- a/README_cn.md +++ b/README_cn.md @@ -25,6 +25,8 @@ | 教程文档 | 模型列表 | AIStudio 课程 + | 论文 + | Gitee From 780da806d75f8e07ba62ec47e16a2b5cfa636ac7 Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Thu, 26 May 2022 03:46:01 +0000 Subject: [PATCH 09/14] fix test_cli, test=doc --- tests/unit/cli/test_cli.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh index e1f1853f6..e0ebd1412 100755 --- a/tests/unit/cli/test_cli.sh +++ b/tests/unit/cli/test_cli.sh @@ -25,7 +25,7 @@ paddlespeech asr --model deepspeech2offline_librispeech --lang en --input ./en.w # long audio restriction { wget -c https://paddlespeech.bj.bcebos.com/datasets/single_wav/zh/test_long_audio_01.wav -paddlespeech asr --input test_long_audio_01.wav +paddlespeech asr --model deepspeech2online_wenetspeech --input test_long_audio_01.wav -y if [ $? -ne 255 ]; then echo -e "\e[1;31mTime restriction not passed\e[0m" exit 1 @@ -54,7 +54,7 @@ paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input # Speech Translation (only support linux) paddlespeech st --input ./en.wav -# Speaker Verification +# Speaker Verification wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav paddlespeech vector --task spk --input 85236145389.wav @@ -65,7 +65,7 @@ echo -e "demo1 85236145389.wav \n demo2 85236145389.wav" > vec.job paddlespeech vector --task spk --input vec.job echo -e "demo3 85236145389.wav \n demo4 85236145389.wav" | paddlespeech vector --task spk -rm 85236145389.wav +rm 85236145389.wav rm vec.job # shell pipeline From 49dadc8044ace30a12782775dc1a8c659a5b30e7 Mon Sep 17 00:00:00 2001 From: KP <109694228@qq.com> Date: Thu, 26 May 2022 13:32:26 +0800 Subject: [PATCH 10/14] Update usage and doc of cli executor. 
--- demos/audio_searching/src/encode.py | 2 +- demos/audio_tagging/README.md | 2 +- demos/audio_tagging/README_cn.md | 2 +- demos/automatic_video_subtitiles/README.md | 3 ++- demos/automatic_video_subtitiles/README_cn.md | 3 ++- demos/automatic_video_subtitiles/recognize.py | 4 ++-- demos/punctuation_restoration/README.md | 2 +- demos/punctuation_restoration/README_cn.md | 2 +- demos/speaker_verification/README.md | 2 +- demos/speaker_verification/README_cn.md | 2 +- demos/speech_recognition/README.md | 2 +- demos/speech_recognition/README_cn.md | 2 +- demos/speech_translation/README.md | 2 +- demos/speech_translation/README_cn.md | 2 +- demos/text_to_speech/README.md | 2 +- demos/text_to_speech/README_cn.md | 2 +- 16 files changed, 19 insertions(+), 17 deletions(-) diff --git a/demos/audio_searching/src/encode.py b/demos/audio_searching/src/encode.py index c89a11c1f..f6bcb00ad 100644 --- a/demos/audio_searching/src/encode.py +++ b/demos/audio_searching/src/encode.py @@ -14,7 +14,7 @@ import numpy as np from logs import LOGGER -from paddlespeech.cli import VectorExecutor +from paddlespeech.cli.vector import VectorExecutor vector_executor = VectorExecutor() diff --git a/demos/audio_tagging/README.md b/demos/audio_tagging/README.md index 9d4af0be6..fc4a334ea 100644 --- a/demos/audio_tagging/README.md +++ b/demos/audio_tagging/README.md @@ -57,7 +57,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespe - Python API ```python import paddle - from paddlespeech.cli import CLSExecutor + from paddlespeech.cli.cls import CLSExecutor cls_executor = CLSExecutor() result = cls_executor( diff --git a/demos/audio_tagging/README_cn.md b/demos/audio_tagging/README_cn.md index 79f87bf8c..36b5d8aaf 100644 --- a/demos/audio_tagging/README_cn.md +++ b/demos/audio_tagging/README_cn.md @@ -57,7 +57,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespe - Python API ```python import paddle - from paddlespeech.cli import CLSExecutor + from paddlespeech.cli.cls import CLSExecutor cls_executor = CLSExecutor() result = cls_executor( diff --git a/demos/automatic_video_subtitiles/README.md b/demos/automatic_video_subtitiles/README.md index db6da40db..b815425ec 100644 --- a/demos/automatic_video_subtitiles/README.md +++ b/demos/automatic_video_subtitiles/README.md @@ -28,7 +28,8 @@ ffmpeg -i subtitle_demo1.mp4 -ac 1 -ar 16000 -vn input.wav - Python API ```python import paddle - from paddlespeech.cli import ASRExecutor, TextExecutor + from paddlespeech.cli.asr import ASRExecutor + from paddlespeech.cli.text import TextExecutor asr_executor = ASRExecutor() text_executor = TextExecutor() diff --git a/demos/automatic_video_subtitiles/README_cn.md b/demos/automatic_video_subtitiles/README_cn.md index fc7b2cf6a..990ff6dbd 100644 --- a/demos/automatic_video_subtitiles/README_cn.md +++ b/demos/automatic_video_subtitiles/README_cn.md @@ -23,7 +23,8 @@ ffmpeg -i subtitle_demo1.mp4 -ac 1 -ar 16000 -vn input.wav - Python API ```python import paddle - from paddlespeech.cli import ASRExecutor, TextExecutor + from paddlespeech.cli.asr import ASRExecutor + from paddlespeech.cli.text import TextExecutor asr_executor = ASRExecutor() text_executor = TextExecutor() diff --git a/demos/automatic_video_subtitiles/recognize.py b/demos/automatic_video_subtitiles/recognize.py index 72e3c3a85..304599d19 100644 --- a/demos/automatic_video_subtitiles/recognize.py +++ b/demos/automatic_video_subtitiles/recognize.py @@ -16,8 +16,8 @@ import os import paddle -from paddlespeech.cli import 
ASRExecutor -from paddlespeech.cli import TextExecutor +from paddlespeech.cli.asr import ASRExecutor +from paddlespeech.cli.text import TextExecutor # yapf: disable parser = argparse.ArgumentParser(__doc__) diff --git a/demos/punctuation_restoration/README.md b/demos/punctuation_restoration/README.md index 518d437dc..458ab92f9 100644 --- a/demos/punctuation_restoration/README.md +++ b/demos/punctuation_restoration/README.md @@ -42,7 +42,7 @@ The input of this demo should be a text of the specific language that can be pas - Python API ```python import paddle - from paddlespeech.cli import TextExecutor + from paddlespeech.cli.text import TextExecutor text_executor = TextExecutor() result = text_executor( diff --git a/demos/punctuation_restoration/README_cn.md b/demos/punctuation_restoration/README_cn.md index 9d4be8bf0..f25acdadb 100644 --- a/demos/punctuation_restoration/README_cn.md +++ b/demos/punctuation_restoration/README_cn.md @@ -44,7 +44,7 @@ - Python API ```python import paddle - from paddlespeech.cli import TextExecutor + from paddlespeech.cli.text import TextExecutor text_executor = TextExecutor() result = text_executor( diff --git a/demos/speaker_verification/README.md b/demos/speaker_verification/README.md index 63dc9294e..900b5ae40 100644 --- a/demos/speaker_verification/README.md +++ b/demos/speaker_verification/README.md @@ -96,7 +96,7 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav - Python API ```python - from paddlespeech.cli import VectorExecutor + from paddlespeech.cli.vector import VectorExecutor vector_executor = VectorExecutor() audio_emb = vector_executor( diff --git a/demos/speaker_verification/README_cn.md b/demos/speaker_verification/README_cn.md index 07eeac2ee..f6afa86ac 100644 --- a/demos/speaker_verification/README_cn.md +++ b/demos/speaker_verification/README_cn.md @@ -95,7 +95,7 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav - Python API ```python import paddle - from paddlespeech.cli import VectorExecutor + from paddlespeech.cli.vector import VectorExecutor vector_executor = VectorExecutor() audio_emb = vector_executor( diff --git a/demos/speech_recognition/README.md b/demos/speech_recognition/README.md index 6493e8e61..c815a88af 100644 --- a/demos/speech_recognition/README.md +++ b/demos/speech_recognition/README.md @@ -58,7 +58,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - Python API ```python import paddle - from paddlespeech.cli import ASRExecutor + from paddlespeech.cli.asr import ASRExecutor asr_executor = ASRExecutor() text = asr_executor( diff --git a/demos/speech_recognition/README_cn.md b/demos/speech_recognition/README_cn.md index 8d631d89c..13aa9f277 100644 --- a/demos/speech_recognition/README_cn.md +++ b/demos/speech_recognition/README_cn.md @@ -56,7 +56,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - Python API ```python import paddle - from paddlespeech.cli import ASRExecutor + from paddlespeech.cli.asr import ASRExecutor asr_executor = ASRExecutor() text = asr_executor( diff --git a/demos/speech_translation/README.md b/demos/speech_translation/README.md index f675a4eda..00a9c7932 100644 --- a/demos/speech_translation/README.md +++ b/demos/speech_translation/README.md @@ -47,7 +47,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - Python API ```python import paddle - from paddlespeech.cli import STExecutor + from paddlespeech.cli.st import STExecutor st_executor = 
STExecutor() text = st_executor( diff --git a/demos/speech_translation/README_cn.md b/demos/speech_translation/README_cn.md index bad9b392f..5119bf9f4 100644 --- a/demos/speech_translation/README_cn.md +++ b/demos/speech_translation/README_cn.md @@ -47,7 +47,7 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespee - Python API ```python import paddle - from paddlespeech.cli import STExecutor + from paddlespeech.cli.st import STExecutor st_executor = STExecutor() text = st_executor( diff --git a/demos/text_to_speech/README.md b/demos/text_to_speech/README.md index 2df72a82d..389847a12 100644 --- a/demos/text_to_speech/README.md +++ b/demos/text_to_speech/README.md @@ -77,7 +77,7 @@ The input of this demo should be a text of the specific language that can be pas - Python API ```python import paddle - from paddlespeech.cli import TTSExecutor + from paddlespeech.cli.tts import TTSExecutor tts_executor = TTSExecutor() wav_file = tts_executor( diff --git a/demos/text_to_speech/README_cn.md b/demos/text_to_speech/README_cn.md index 7e02b9624..f967d3d4d 100644 --- a/demos/text_to_speech/README_cn.md +++ b/demos/text_to_speech/README_cn.md @@ -80,7 +80,7 @@ - Python API ```python import paddle - from paddlespeech.cli import TTSExecutor + from paddlespeech.cli.tts import TTSExecutor tts_executor = TTSExecutor() wav_file = tts_executor( From be70016edba9552c4d96c26c3f1e76847db17ecb Mon Sep 17 00:00:00 2001 From: r Date: Thu, 26 May 2022 16:11:05 -0500 Subject: [PATCH 11/14] Improve readability --- demos/README.md | 6 +++--- speechx/README.md | 6 +++--- third_party/README.md | 14 +++++++------- third_party/ctc_decoders/LICENSE | 2 +- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/demos/README.md b/demos/README.md index 8abd67249..2a306df6b 100644 --- a/demos/README.md +++ b/demos/README.md @@ -2,14 +2,14 @@ ([简体中文](./README_cn.md)|English) -The directory containes many speech applications in multi scenarios. +This directory contains many speech applications in multiple scenarios. * audio searching - mass audio similarity retrieval * audio tagging - multi-label tagging of an audio file -* automatic_video_subtitiles - generate subtitles from a video +* automatic_video_subtitles - generate subtitles from a video * metaverse - 2D AR with TTS * punctuation_restoration - restore punctuation from raw text -* speech recogintion - recognize text of an audio file +* speech recognition - recognize text of an audio file * speech server - Server for Speech Task, e.g. ASR,TTS,CLS * streaming asr server - receive audio stream from websocket, and recognize to transcript. * speech translation - end to end speech translation diff --git a/speechx/README.md b/speechx/README.md index f75d8ac4e..cd1cd62c1 100644 --- a/speechx/README.md +++ b/speechx/README.md @@ -44,13 +44,13 @@ More details please see `README.md` under `examples`. > If using docker please check `--privileged` is set when `docker run`. * Fatal error at startup: `a function redirection which is mandatory for this platform-tool combination cannot be set up` -``` +```bash apt-get install libc6-dbg ``` * Install -``` +```bash pushd tools ./setup_valgrind.sh popd @@ -59,4 +59,4 @@ popd ## TODO ### Deepspeech2 with linear feature -* DecibelNormalizer: there is a little bit difference between offline and online db norm. The computation of online db norm read feature chunk by chunk, which causes the feature size is different with offline db norm. 
In normalizer.cc:73, the samples.size() is different, which causes the difference of result. 
+* DecibelNormalizer: there is a small difference between the offline and online db norm. The computation of the online db norm reads features chunk by chunk, which makes the feature size differ from that of the offline db norm. In `normalizer.cc:73`, `samples.size()` is different, which causes the results to differ (see the sketch after this patch).
diff --git a/third_party/README.md b/third_party/README.md
index c73df5427..843d0d3b2 100644
--- a/third_party/README.md
+++ b/third_party/README.md
@@ -1,27 +1,27 @@
 * [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features)
 commit: fc1bd6240c2008412ab64dc25045cd872f5e126c
 ref: https://zhuanlan.zhihu.com/p/55371926
-licence: MIT
+license: MIT
 
 * [python-pinyin](https://github.com/mozillazg/python-pinyin.git)
 commit: 55e524aa1b7b8eec3d15c5306043c6cdd5938b03
-licence: MIT
+license: MIT
 
 * [zhon](https://github.com/tsroten/zhon)
 commit: 09bf543696277f71de502506984661a60d24494c
-licence: MIT
+license: MIT
 
 * [pymmseg-cpp](https://github.com/pluskid/pymmseg-cpp.git)
 commit: b76465045717fbb4f118c4fbdd24ce93bab10a6d
-licence: MIT
+license: MIT
 
 * [chinese_text_normalization](https://github.com/speechio/chinese_text_normalization.git)
 commit: 9e92c7bf2d6b5a7974305406d8e240045beac51c
-licence: MIT
+license: MIT
 
 * [phkit](https://github.com/KuangDD/phkit.git)
 commit: b2100293c1e36da531d7f30bd52c9b955a649522
-licence: None
+license: None
 
 * [nnAudio](https://github.com/KinWaiCheuk/nnAudio.git)
-licence: MIT
+license: MIT
 
diff --git a/third_party/ctc_decoders/LICENSE b/third_party/ctc_decoders/LICENSE
index eeef74b30..ad947f8d7 100644
--- a/third_party/ctc_decoders/LICENSE
+++ b/third_party/ctc_decoders/LICENSE
@@ -5,4 +5,4 @@ score.h and score.cpp is under the LGPL license.
 The two files include the header files from KenLM project.
 
 For the rest:
-The default licence of paddlespeech-ctcdecoders is Apache License 2.0.
+The default license of paddlespeech-ctcdecoders is Apache License 2.0.
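A minimal NumPy sketch of the DecibelNormalizer discrepancy noted in the patch above — recomputing the gain chunk by chunk drifts from the full-utterance (offline) result because each chunk sees a different `samples.size()`. This is an illustration only, not the speechx implementation: the function names, the `target_db` parameter, and the chunking scheme are assumptions made for the example.

```python
import numpy as np


def db_norm_offline(samples: np.ndarray, target_db: float = -20.0) -> np.ndarray:
    """Offline: the gain is computed once over the whole utterance."""
    rms_db = 10.0 * np.log10(np.mean(samples**2) + 1e-20)
    return samples * 10.0**((target_db - rms_db) / 20.0)


def db_norm_online(samples: np.ndarray, chunk_size: int,
                   target_db: float = -20.0) -> np.ndarray:
    """Online: the gain is recomputed per chunk, so the RMS estimate
    (and therefore the output) differs from the offline version."""
    chunks = []
    for start in range(0, len(samples), chunk_size):
        chunk = samples[start:start + chunk_size]
        rms_db = 10.0 * np.log10(np.mean(chunk**2) + 1e-20)
        chunks.append(chunk * 10.0**((target_db - rms_db) / 20.0))
    return np.concatenate(chunks)


x = np.random.randn(16000).astype(np.float32)
# Non-zero: the two paths produce different samples for the same input.
print(np.abs(db_norm_offline(x) - db_norm_online(x, chunk_size=4000)).max())
```

With any chunk size smaller than the utterance, the per-chunk RMS differs from the global RMS, so the two outputs diverge even though each is individually normalized.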
From 8373eed67f0a9b2a642e5fe1e95e084a236844ba Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 27 May 2022 00:02:53 +0000 Subject: [PATCH 12/14] fix speechx compile error --- speechx/examples/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/speechx/examples/CMakeLists.txt b/speechx/examples/CMakeLists.txt index 3c274a20a..bcb23eddb 100644 --- a/speechx/examples/CMakeLists.txt +++ b/speechx/examples/CMakeLists.txt @@ -1,4 +1,3 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR) add_subdirectory(ds2_ol) -add_subdirectory(dev) \ No newline at end of file From 42fba661c9073c415dfd7d460bc8a510bc46359d Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 27 May 2022 00:03:19 +0000 Subject: [PATCH 13/14] more detail of copyright --- examples/wenetspeech/asr1/local/extract_meta.py | 16 +++------------- paddlespeech/kws/exps/mdtc/compute_det.py | 2 ++ paddlespeech/kws/exps/mdtc/plot_det_curve.py | 2 ++ paddlespeech/kws/exps/mdtc/score.py | 4 +++- paddlespeech/kws/models/loss.py | 1 + paddlespeech/kws/models/mdtc.py | 1 + paddlespeech/s2t/io/dataset.py | 1 + paddlespeech/s2t/models/u2/u2.py | 1 + paddlespeech/s2t/models/u2/updater.py | 2 +- paddlespeech/s2t/utils/ctc_utils.py | 1 + paddlespeech/s2t/utils/text_grid.py | 1 + utils/compute-wer.py | 2 +- 12 files changed, 18 insertions(+), 16 deletions(-) diff --git a/examples/wenetspeech/asr1/local/extract_meta.py b/examples/wenetspeech/asr1/local/extract_meta.py index 0e1b27278..2cad977be 100644 --- a/examples/wenetspeech/asr1/local/extract_meta.py +++ b/examples/wenetspeech/asr1/local/extract_meta.py @@ -1,18 +1,7 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. # Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang) # Mobvoi Inc(Author: Di Wu, Binbin Zhang) +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -24,6 +13,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + import argparse import json import os diff --git a/paddlespeech/kws/exps/mdtc/compute_det.py b/paddlespeech/kws/exps/mdtc/compute_det.py index e43a953db..853056966 100644 --- a/paddlespeech/kws/exps/mdtc/compute_det.py +++ b/paddlespeech/kws/exps/mdtc/compute_det.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021 Binbin Zhang(binbzha@qq.com) +# 2022 Shaoqing Yu(954793264@qq.com) # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/paddlespeech/kws/exps/mdtc/plot_det_curve.py b/paddlespeech/kws/exps/mdtc/plot_det_curve.py index a3ea21eff..4960281ee 100644 --- a/paddlespeech/kws/exps/mdtc/plot_det_curve.py +++ b/paddlespeech/kws/exps/mdtc/plot_det_curve.py @@ -1,3 +1,5 @@ +# Copyright (c) 2021 Binbin Zhang(binbzha@qq.com) +# Menglong Xu # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/paddlespeech/kws/exps/mdtc/score.py b/paddlespeech/kws/exps/mdtc/score.py index 1b5e1e296..556455ca1 100644 --- a/paddlespeech/kws/exps/mdtc/score.py +++ b/paddlespeech/kws/exps/mdtc/score.py @@ -1,4 +1,6 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 Binbin Zhang(binbzha@qq.com) +# 2022 Shaoqing Yu(954793264@qq.com) +# 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/kws/models/loss.py b/paddlespeech/kws/models/loss.py index 64c9a32c9..bda77f2ba 100644 --- a/paddlespeech/kws/models/loss.py +++ b/paddlespeech/kws/models/loss.py @@ -1,3 +1,4 @@ +# Copyright (c) 2021 Binbin Zhang # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/paddlespeech/kws/models/mdtc.py b/paddlespeech/kws/models/mdtc.py index 5d2e5de64..c605a02b6 100644 --- a/paddlespeech/kws/models/mdtc.py +++ b/paddlespeech/kws/models/mdtc.py @@ -1,3 +1,4 @@ +# Copyright (c) 2021 Jingyong Hou (houjingyong@gmail.com) # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/paddlespeech/s2t/io/dataset.py b/paddlespeech/s2t/io/dataset.py index 0e94f047b..9987b5110 100644 --- a/paddlespeech/s2t/io/dataset.py +++ b/paddlespeech/s2t/io/dataset.py @@ -1,4 +1,5 @@ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 Mobvoi Inc. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py index d5471369f..b4b61666f 100644 --- a/paddlespeech/s2t/models/u2/u2.py +++ b/paddlespeech/s2t/models/u2/u2.py @@ -1,3 +1,4 @@ +# Copyright 2021 Mobvoi Inc. All Rights Reserved. # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/paddlespeech/s2t/models/u2/updater.py b/paddlespeech/s2t/models/u2/updater.py index c59090a84..898a50bf0 100644 --- a/paddlespeech/s2t/models/u2/updater.py +++ b/paddlespeech/s2t/models/u2/updater.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -# Modified from wenet(https://github.com/wenet-e2e/wenet) + from contextlib import nullcontext import paddle diff --git a/paddlespeech/s2t/utils/ctc_utils.py b/paddlespeech/s2t/utils/ctc_utils.py index 886b72033..42564d8e1 100644 --- a/paddlespeech/s2t/utils/ctc_utils.py +++ b/paddlespeech/s2t/utils/ctc_utils.py @@ -1,3 +1,4 @@ +# Copyright 2021 Mobvoi Inc. All Rights Reserved. # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/paddlespeech/s2t/utils/text_grid.py b/paddlespeech/s2t/utils/text_grid.py index cbd9856e4..e696f43d5 100644 --- a/paddlespeech/s2t/utils/text_grid.py +++ b/paddlespeech/s2t/utils/text_grid.py @@ -1,3 +1,4 @@ +# Copyright 2021 Mobvoi Inc. All Rights Reserved. # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/utils/compute-wer.py b/utils/compute-wer.py index 978a80c9f..98bb24a7e 100755 --- a/utils/compute-wer.py +++ b/utils/compute-wer.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# CopyRight WeNet Apache-2.0 License +# Copyright 2021 Mobvoi Inc. All Rights Reserved. import codecs import re import sys From aa49d2539ddefc2562a6745e94fb5a54cbdf3576 Mon Sep 17 00:00:00 2001 From: Hui Zhang Date: Fri, 27 May 2022 00:07:01 +0000 Subject: [PATCH 14/14] 2022 year for default copyright --- .pre-commit-hooks/copyright-check.hook | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-hooks/copyright-check.hook b/.pre-commit-hooks/copyright-check.hook index 26044c29e..761edbc01 100644 --- a/.pre-commit-hooks/copyright-check.hook +++ b/.pre-commit-hooks/copyright-check.hook @@ -19,7 +19,7 @@ import subprocess import platform COPYRIGHT = ''' -Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
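For reference, a sketch of how a copyright pre-commit check of this kind typically works. The hunk above only shows the `COPYRIGHT` template being bumped to 2022; the matching logic below is an assumption for illustration, not the actual body of `copyright-check.hook`.

```python
import re
import sys

# Hypothetical pattern: accept any year so that bumping the template's
# default year does not invalidate headers added in earlier years.
COPYRIGHT_RE = re.compile(r"Copyright \(c\) \d{4} PaddlePaddle Authors")


def has_copyright(path: str) -> bool:
    # License headers sit at the top of a file, so a small prefix suffices.
    with open(path, encoding="utf-8") as f:
        return COPYRIGHT_RE.search(f.read(2048)) is not None


if __name__ == "__main__":
    missing = [p for p in sys.argv[1:] if not has_copyright(p)]
    for p in missing:
        print(f"missing copyright header: {p}", file=sys.stderr)
    sys.exit(1 if missing else 0)
```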